From 3b8b760232a9672406fab03c25251261ae0704d2 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Tue, 27 Jul 2021 12:46:32 -0400 Subject: [PATCH 1/5] Update .clang-format --- cpp/.clang-format | 97 +++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 45 deletions(-) diff --git a/cpp/.clang-format b/cpp/.clang-format index 779ca0033a..0c05436e92 100644 --- a/cpp/.clang-format +++ b/cpp/.clang-format @@ -1,72 +1,78 @@ --- # Refer to the following link for the explanation of each params: -# http://releases.llvm.org/8.0.1/tools/clang/docs/ClangFormatStyleOptions.html -Language: Cpp -# BasedOnStyle: Google +# http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html +Language: Cpp +# BasedOnStyle: Google AccessModifierOffset: -1 AlignAfterOpenBracket: Align -AlignConsecutiveAssignments: false +AlignConsecutiveAssignments: true +AlignConsecutiveBitFields: true AlignConsecutiveDeclarations: false +AlignConsecutiveMacros: true AlignEscapedNewlines: Left -AlignOperands: true +AlignOperands: true AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: false -AllowShortCaseLabelsOnASingleLine: false +AllowShortBlocksOnASingleLine: true +AllowShortCaseLabelsOnASingleLine: true +AllowShortEnumsOnASingleLine: true AllowShortFunctionsOnASingleLine: All AllowShortIfStatementsOnASingleLine: true -AllowShortLoopsOnASingleLine: true +AllowShortLambdasOnASingleLine: true +AllowShortLoopsOnASingleLine: false # This is deprecated AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: true AlwaysBreakTemplateDeclarations: Yes -BinPackArguments: true -BinPackParameters: true +BinPackArguments: false +BinPackParameters: false BraceWrapping: - AfterClass: false + AfterClass: false AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - IndentBraces: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false # disabling the below splits, else, they'll just add to the vertical length of source files! SplitEmptyFunction: false SplitEmptyRecord: false SplitEmptyNamespace: false +BreakAfterJavaFieldAnnotations: false BreakBeforeBinaryOperators: None -BreakBeforeBraces: Attach +BreakBeforeBraces: WebKit BreakBeforeInheritanceComma: false -BreakInheritanceList: BeforeColon BreakBeforeTernaryOperators: true BreakConstructorInitializersBeforeComma: false BreakConstructorInitializers: BeforeColon -BreakAfterJavaFieldAnnotations: false +BreakInheritanceList: BeforeColon BreakStringLiterals: true -ColumnLimit: 80 -CommentPragmas: '^ IWYU pragma:' +ColumnLimit: 100 +CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: true # Kept the below 2 to be the same as `IndentWidth` to keep everything uniform ConstructorInitializerIndentWidth: 2 ContinuationIndentWidth: 2 Cpp11BracedListStyle: true -DerivePointerAlignment: true -DisableFormat: false +DerivePointerAlignment: false +DisableFormat: false ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true -ForEachMacros: +ForEachMacros: - foreach - Q_FOREACH - BOOST_FOREACH -IncludeBlocks: Preserve -IncludeCategories: +IncludeBlocks: Preserve +IncludeCategories: - Regex: '^' Priority: 2 - Regex: '^<.*\.h>' @@ -100,9 +106,9 @@ PenaltyBreakTemplateDeclaration: 10 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Left -RawStringFormats: - - Language: Cpp - Delimiters: +RawStringFormats: + - Language: Cpp + Delimiters: - cc - CC - cpp @@ -111,7 +117,7 @@ RawStringFormats: - 'c++' - 'C++' CanonicalDelimiter: '' - - Language: TextProto + - Language: TextProto Delimiters: - pb - PB @@ -126,10 +132,10 @@ RawStringFormats: - ParseTextOrDie - ParseTextProtoOrDie CanonicalDelimiter: '' - BasedOnStyle: google + BasedOnStyle: google # Enabling comment reflow causes doxygen comments to be messed up in their formats! -ReflowComments: false -SortIncludes: true +ReflowComments: true +SortIncludes: true SortUsingDeclarations: true SpaceAfterCStyleCast: false SpaceAfterTemplateKeyword: true @@ -139,19 +145,20 @@ SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: ControlStatements SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false +SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 2 -SpacesInAngles: false +SpacesInAngles: false +SpacesInConditionalStatement: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false -# We are C++14, but clang-format puts this under `Cpp11` itself -Standard: Cpp11 -StatementMacros: +Standard: c++17 +StatementMacros: - Q_UNUSED - QT_REQUIRE_VERSION # Be consistent with indent-width, even for people who use tab for indentation! -TabWidth: 2 -UseTab: Never -... +TabWidth: 2 +UseTab: Never From cc03dbac0da3a25b51404fec2526c43812982be7 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Tue, 27 Jul 2021 12:47:02 -0400 Subject: [PATCH 2/5] Formatting changes --- cpp/include/raft.hpp | 3 +- cpp/include/raft/cache/cache_util.cuh | 104 +- cpp/include/raft/common/cub_wrappers.cuh | 42 +- .../raft/common/device_loads_stores.cuh | 87 +- cpp/include/raft/common/scatter.cuh | 77 +- cpp/include/raft/comms/comms.hpp | 342 ++-- cpp/include/raft/comms/helper.hpp | 37 +- cpp/include/raft/comms/mpi_comms.hpp | 300 ++-- cpp/include/raft/comms/std_comms.hpp | 328 ++-- cpp/include/raft/comms/test.hpp | 236 ++- cpp/include/raft/comms/ucp_helper.hpp | 138 +- cpp/include/raft/comms/util.hpp | 114 +- cpp/include/raft/cuda_utils.cuh | 259 ++- cpp/include/raft/cudart_utils.h | 190 +- cpp/include/raft/device_atomics.cuh | 265 ++- cpp/include/raft/distance/canberra.cuh | 136 +- cpp/include/raft/distance/chebyshev.cuh | 136 +- cpp/include/raft/distance/cosine.cuh | 175 +- cpp/include/raft/distance/distance.cuh | 520 ++++-- cpp/include/raft/distance/euclidean.cuh | 314 ++-- cpp/include/raft/distance/fused_l2_nn.cuh | 254 ++- cpp/include/raft/distance/hellinger.cuh | 154 +- cpp/include/raft/distance/l1.cuh | 128 +- cpp/include/raft/distance/minkowski.cuh | 139 +- .../raft/distance/pairwise_distance_base.cuh | 159 +- cpp/include/raft/error.hpp | 50 +- cpp/include/raft/handle.hpp | 121 +- cpp/include/raft/integer_utils.h | 55 +- cpp/include/raft/label/classlabels.cuh | 137 +- cpp/include/raft/label/merge_labels.cuh | 31 +- cpp/include/raft/lap/d_structs.h | 20 +- cpp/include/raft/lap/lap.cuh | 161 +- cpp/include/raft/lap/lap_functions.cuh | 399 +++-- cpp/include/raft/lap/lap_kernels.cuh | 343 ++-- cpp/include/raft/linalg/add.cuh | 35 +- cpp/include/raft/linalg/binary_op.cuh | 61 +- .../raft/linalg/cholesky_r1_update.cuh | 63 +- .../raft/linalg/coalesced_reduction.cuh | 55 +- cpp/include/raft/linalg/contractions.cuh | 76 +- cpp/include/raft/linalg/cublas_wrappers.h | 921 +++++++--- cpp/include/raft/linalg/cusolver_wrappers.h | 1144 +++++++++--- cpp/include/raft/linalg/divide.cuh | 7 +- cpp/include/raft/linalg/eig.cuh | 169 +- cpp/include/raft/linalg/eltwise.cuh | 56 +- cpp/include/raft/linalg/gemm.cuh | 85 +- cpp/include/raft/linalg/gemv.h | 54 +- cpp/include/raft/linalg/init.h | 6 +- cpp/include/raft/linalg/lanczos.hpp | 786 +++++--- cpp/include/raft/linalg/map.cuh | 31 +- cpp/include/raft/linalg/map_then_reduce.cuh | 92 +- cpp/include/raft/linalg/matrix_vector_op.cuh | 102 +- .../raft/linalg/mean_squared_error.cuh | 10 +- cpp/include/raft/linalg/multiply.cuh | 7 +- cpp/include/raft/linalg/norm.cuh | 92 +- cpp/include/raft/linalg/qr.cuh | 87 +- cpp/include/raft/linalg/reduce.cuh | 37 +- cpp/include/raft/linalg/strided_reduction.cuh | 74 +- cpp/include/raft/linalg/subtract.cuh | 34 +- cpp/include/raft/linalg/svd.cuh | 238 ++- cpp/include/raft/linalg/transpose.h | 61 +- cpp/include/raft/linalg/unary_op.cuh | 86 +- cpp/include/raft/matrix/math.cuh | 286 ++- cpp/include/raft/matrix/matrix.cuh | 208 ++- cpp/include/raft/mr/buffer_base.hpp | 59 +- cpp/include/raft/mr/device/allocator.hpp | 9 +- cpp/include/raft/mr/device/buffer.hpp | 14 +- cpp/include/raft/mr/host/allocator.hpp | 13 +- cpp/include/raft/mr/host/buffer.hpp | 21 +- cpp/include/raft/random/rng.cuh | 319 ++-- cpp/include/raft/random/rng_impl.cuh | 89 +- cpp/include/raft/sparse/convert/coo.cuh | 20 +- cpp/include/raft/sparse/convert/csr.cuh | 126 +- cpp/include/raft/sparse/convert/dense.cuh | 35 +- cpp/include/raft/sparse/coo.cuh | 192 +- cpp/include/raft/sparse/csr.cuh | 131 +- cpp/include/raft/sparse/cusparse_wrappers.h | 1590 ++++++++++++----- .../raft/sparse/distance/bin_distance.cuh | 189 +- cpp/include/raft/sparse/distance/common.h | 18 +- cpp/include/raft/sparse/distance/coo_spmv.cuh | 118 +- .../coo_spmv_strategies/base_strategy.cuh | 138 +- .../coo_mask_row_iterators.cuh | 166 +- .../dense_smem_strategy.cuh | 104 +- .../coo_spmv_strategies/hash_strategy.cuh | 277 +-- .../distance/detail/coo_spmv_kernel.cuh | 196 +- cpp/include/raft/sparse/distance/distance.cuh | 48 +- .../raft/sparse/distance/ip_distance.cuh | 27 +- .../raft/sparse/distance/l2_distance.cuh | 386 ++-- .../raft/sparse/distance/lp_distance.cuh | 199 ++- .../raft/sparse/distance/operators.cuh | 29 +- cpp/include/raft/sparse/distance/utils.cuh | 6 +- cpp/include/raft/sparse/hierarchy/common.h | 10 +- .../sparse/hierarchy/detail/agglomerative.cuh | 124 +- .../hierarchy/detail/connectivities.cuh | 92 +- .../raft/sparse/hierarchy/detail/mst.cuh | 93 +- .../raft/sparse/hierarchy/single_linkage.hpp | 66 +- cpp/include/raft/sparse/linalg/add.cuh | 116 +- cpp/include/raft/sparse/linalg/degree.cuh | 56 +- cpp/include/raft/sparse/linalg/norm.cuh | 51 +- cpp/include/raft/sparse/linalg/spectral.cuh | 72 +- cpp/include/raft/sparse/linalg/symmetrize.cuh | 157 +- cpp/include/raft/sparse/linalg/transpose.h | 56 +- .../raft/sparse/mst/detail/mst_kernels.cuh | 160 +- .../raft/sparse/mst/detail/mst_solver_inl.cuh | 258 +-- cpp/include/raft/sparse/mst/detail/utils.cuh | 19 +- cpp/include/raft/sparse/mst/mst.cuh | 34 +- cpp/include/raft/sparse/mst/mst_solver.cuh | 48 +- cpp/include/raft/sparse/op/filter.cuh | 115 +- cpp/include/raft/sparse/op/reduce.cuh | 55 +- cpp/include/raft/sparse/op/row_op.cuh | 16 +- cpp/include/raft/sparse/op/slice.h | 34 +- cpp/include/raft/sparse/op/sort.h | 35 +- .../sparse/selection/connect_components.cuh | 224 ++- cpp/include/raft/sparse/selection/knn.cuh | 444 +++-- .../raft/sparse/selection/knn_graph.cuh | 54 +- .../raft/sparse/selection/selection.cuh | 99 +- cpp/include/raft/sparse/utils.h | 22 +- cpp/include/raft/spatial/knn/ann.hpp | 31 +- cpp/include/raft/spatial/knn/ann_common.h | 10 +- .../knn/detail/ann_quantized_faiss.cuh | 141 +- .../raft/spatial/knn/detail/common_faiss.h | 37 +- .../spatial/knn/detail/haversine_distance.cuh | 56 +- .../knn/detail/knn_brute_force_faiss.cuh | 178 +- .../raft/spatial/knn/detail/processing.hpp | 134 +- cpp/include/raft/spatial/knn/knn.hpp | 64 +- cpp/include/raft/spectral/cluster_solvers.hpp | 39 +- cpp/include/raft/spectral/eigen_solvers.hpp | 66 +- cpp/include/raft/spectral/kmeans.hpp | 476 +++-- cpp/include/raft/spectral/lapack.hpp | 552 ++++-- cpp/include/raft/spectral/matrix_wrappers.hpp | 279 +-- .../raft/spectral/modularity_maximization.hpp | 52 +- cpp/include/raft/spectral/partition.hpp | 61 +- cpp/include/raft/spectral/spectral_util.hpp | 125 +- cpp/include/raft/spectral/warn_dbg.hpp | 4 +- cpp/include/raft/stats/mean.cuh | 42 +- cpp/include/raft/stats/mean_center.cuh | 45 +- cpp/include/raft/stats/stddev.cuh | 102 +- cpp/include/raft/stats/sum.cuh | 38 +- cpp/include/raft/vectorized.cuh | 112 +- cpp/test/cluster_solvers.cu | 22 +- cpp/test/cudart_utils.cpp | 3 +- cpp/test/distance/dist_adj.cu | 78 +- cpp/test/distance/dist_canberra.cu | 24 +- cpp/test/distance/dist_chebyshev.cu | 24 +- cpp/test/distance/dist_cos.cu | 23 +- cpp/test/distance/dist_euc_exp.cu | 22 +- cpp/test/distance/dist_euc_unexp.cu | 18 +- cpp/test/distance/dist_hellinger.cu | 24 +- cpp/test/distance/dist_l1.cu | 24 +- cpp/test/distance/dist_minkowski.cu | 23 +- cpp/test/distance/distance_base.cuh | 203 ++- cpp/test/distance/fused_l2_nn.cu | 192 +- cpp/test/eigen_solvers.cu | 35 +- cpp/test/handle.cpp | 21 +- cpp/test/integer_utils.cpp | 6 +- cpp/test/label/label.cu | 31 +- cpp/test/label/merge_labels.cu | 67 +- cpp/test/lap/lap.cu | 92 +- cpp/test/linalg/add.cu | 13 +- cpp/test/linalg/add.cuh | 17 +- cpp/test/linalg/binary_op.cu | 88 +- cpp/test/linalg/binary_op.cuh | 17 +- cpp/test/linalg/cholesky_r1.cu | 50 +- cpp/test/linalg/coalesced_reduction.cu | 60 +- cpp/test/linalg/divide.cu | 50 +- cpp/test/linalg/eig.cu | 177 +- cpp/test/linalg/eig_sel.cu | 92 +- cpp/test/linalg/eltwise.cu | 98 +- cpp/test/linalg/gemm_layout.cu | 63 +- cpp/test/linalg/map.cu | 98 +- cpp/test/linalg/map_then_reduce.cu | 99 +- cpp/test/linalg/matrix_vector_op.cu | 109 +- cpp/test/linalg/matrix_vector_op.cuh | 73 +- cpp/test/linalg/multiply.cu | 30 +- cpp/test/linalg/norm.cu | 140 +- cpp/test/linalg/reduce.cu | 84 +- cpp/test/linalg/reduce.cuh | 59 +- cpp/test/linalg/strided_reduction.cu | 61 +- cpp/test/linalg/subtract.cu | 74 +- cpp/test/linalg/svd.cu | 108 +- cpp/test/linalg/transpose.cu | 51 +- cpp/test/linalg/unary_op.cu | 46 +- cpp/test/linalg/unary_op.cuh | 17 +- cpp/test/matrix/math.cu | 194 +- cpp/test/matrix/matrix.cu | 84 +- cpp/test/mr/device/buffer.cpp | 16 +- cpp/test/mr/host/buffer.cpp | 9 +- cpp/test/mst.cu | 172 +- cpp/test/random/rng.cu | 203 +-- cpp/test/random/rng_int.cu | 66 +- cpp/test/random/sample_without_replacement.cu | 35 +- cpp/test/sparse/add.cu | 97 +- cpp/test/sparse/connect_components.cu | 599 +++---- cpp/test/sparse/convert_coo.cu | 20 +- cpp/test/sparse/convert_csr.cu | 50 +- cpp/test/sparse/csr_row_slice.cu | 80 +- cpp/test/sparse/csr_to_dense.cu | 63 +- cpp/test/sparse/csr_transpose.cu | 80 +- cpp/test/sparse/degree.cu | 23 +- cpp/test/sparse/dist_coo_spmv.cu | 936 +++++----- cpp/test/sparse/distance.cu | 248 ++- cpp/test/sparse/filter.cu | 33 +- cpp/test/sparse/knn.cu | 91 +- cpp/test/sparse/knn_graph.cu | 36 +- cpp/test/sparse/linkage.cu | 647 +++---- cpp/test/sparse/norm.cu | 34 +- cpp/test/sparse/reduce.cu | 50 +- cpp/test/sparse/row_op.cu | 40 +- cpp/test/sparse/selection.cu | 59 +- cpp/test/sparse/sort.cu | 22 +- cpp/test/sparse/symmetrize.cu | 89 +- cpp/test/spatial/haversine.cu | 61 +- cpp/test/spatial/knn.cu | 89 +- cpp/test/spectral_matrix.cu | 13 +- cpp/test/stats/mean.cu | 94 +- cpp/test/stats/mean_center.cu | 63 +- cpp/test/stats/stddev.cu | 46 +- cpp/test/stats/sum.cu | 25 +- cpp/test/test_utils.h | 136 +- 218 files changed, 16429 insertions(+), 11470 deletions(-) diff --git a/cpp/include/raft.hpp b/cpp/include/raft.hpp index f380d276b2..08f836d3a8 100644 --- a/cpp/include/raft.hpp +++ b/cpp/include/raft.hpp @@ -21,7 +21,8 @@ namespace raft { /* Function for testing RAFT include * * @return message indicating RAFT has been included succesfully*/ -inline std::string test_raft() { +inline std::string test_raft() +{ std::string status = "RAFT Setup succesfully"; return status; } diff --git a/cpp/include/raft/cache/cache_util.cuh b/cpp/include/raft/cache/cache_util.cuh index ce8ef9a095..f63040fa00 100644 --- a/cpp/include/raft/cache/cache_util.cuh +++ b/cpp/include/raft/cache/cache_util.cuh @@ -42,17 +42,15 @@ namespace cache { * @param [out] out vectors collected from the cache, size [n_vec * n] */ template -__global__ void get_vecs(const math_t *cache, int n_vec, const int *cache_idx, - int n, math_t *out) { +__global__ void get_vecs(const math_t* cache, int n_vec, const int* cache_idx, int n, math_t* out) +{ int tid = threadIdx.x + blockIdx.x * blockDim.x; int row = tid % n_vec; // row idx if (tid < n_vec * n) { - size_t out_col = tid / n_vec; // col idx + size_t out_col = tid / n_vec; // col idx size_t cache_col = cache_idx[out_col]; if (cache_idx[out_col] >= 0) { - if (row + out_col * n_vec < (size_t)n_vec * n) { - out[tid] = cache[row + cache_col * n_vec]; - } + if (row + out_col * n_vec < (size_t)n_vec * n) { out[tid] = cache[row + cache_col * n_vec]; } } } } @@ -84,21 +82,26 @@ __global__ void get_vecs(const math_t *cache, int n_vec, const int *cache_idx, * @param [in] n_cache_vecs */ template -__global__ void store_vecs(const math_t *tile, int n_tile, int n_vec, - const int *tile_idx, int n, const int *cache_idx, - math_t *cache, int n_cache_vecs) { +__global__ void store_vecs(const math_t* tile, + int n_tile, + int n_vec, + const int* tile_idx, + int n, + const int* cache_idx, + math_t* cache, + int n_cache_vecs) +{ int tid = threadIdx.x + blockIdx.x * blockDim.x; int row = tid % n_vec; // row idx if (tid < n_vec * n) { - int tile_col = tid / n_vec; // col idx - int data_col = tile_idx ? tile_idx[tile_col] : tile_col; + int tile_col = tid / n_vec; // col idx + int data_col = tile_idx ? tile_idx[tile_col] : tile_col; int cache_col = cache_idx[tile_col]; // We ignore negative values. The rest of the checks should be fulfilled // if the cache is used properly if (cache_col >= 0 && cache_col < n_cache_vecs && data_col < n_tile) { - cache[row + (size_t)cache_col * n_vec] = - tile[row + (size_t)data_col * n_vec]; + cache[row + (size_t)cache_col * n_vec] = tile[row + (size_t)data_col * n_vec]; } } } @@ -121,14 +124,15 @@ int DI hash(int key, int n_cache_sets) { return key % n_cache_sets; } * @return the index of the first element in the array for which * array[idx] >= value. If there is no such value, then return n. */ -int DI arg_first_ge(const int *array, int n, int val) { +int DI arg_first_ge(const int* array, int n, int val) +{ int start = 0; - int end = n - 1; + int end = n - 1; if (array[0] == val) return 0; if (array[end] < val) return n; while (start + 1 < end) { int q = (start + end + 1) / 2; - //invariants: + // invariants: // start < end // start < q <=end // array[start] < val && array[end] <=val @@ -157,7 +161,8 @@ int DI arg_first_ge(const int *array, int n, int val) { * @return the idx of the k-th occurance of val in array, or -1 if * the value is not found. */ -int DI find_nth_occurrence(const int *array, int n, int val, int k) { +int DI find_nth_occurrence(const int* array, int n, int val, int k) +{ int q = arg_first_ge(array, n, val); if (q + k < n && array[q + k] == val) { q += k; @@ -196,10 +201,10 @@ int DI find_nth_occurrence(const int *array, int n, int val, int k) { * Each block should give a different pointer for rank. */ template -DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) { +DI void rank_set_entries(const int* cache_time, int n_cache_sets, int* rank) +{ const int items_per_thread = raft::ceildiv(associativity, nthreads); - typedef cub::BlockRadixSort - BlockRadixSort; + typedef cub::BlockRadixSort BlockRadixSort; __shared__ typename BlockRadixSort::TempStorage temp_storage; int key[items_per_thread]; @@ -208,8 +213,8 @@ DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) { int block_offset = blockIdx.x * associativity; for (int j = 0; j < items_per_thread; j++) { - int k = threadIdx.x + j * nthreads; - int t = (k < associativity) ? cache_time[block_offset + k] : 32768; + int k = threadIdx.x + j * nthreads; + int t = (k < associativity) ? cache_time[block_offset + k] : 32768; key[j] = t; val[j] = k; } @@ -217,9 +222,7 @@ DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) { BlockRadixSort(temp_storage).Sort(key, val); for (int j = 0; j < items_per_thread; j++) { - if (val[j] < associativity) { - rank[val[j]] = threadIdx.x * items_per_thread + j; - } + if (val[j] < associativity) { rank[val[j]] = threadIdx.x * items_per_thread + j; } } __syncthreads(); } @@ -252,9 +255,15 @@ DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) { * not be cached, size [n] */ template -__global__ void assign_cache_idx(const int *keys, int n, const int *cache_set, - int *cached_keys, int n_cache_sets, - int *cache_time, int time, int *cache_idx) { +__global__ void assign_cache_idx(const int* keys, + int n, + const int* cache_set, + int* cached_keys, + int n_cache_sets, + int* cache_time, + int time, + int* cache_idx) +{ int block_offset = blockIdx.x * associativity; const int items_per_thread = raft::ceildiv(associativity, nthreads); @@ -273,7 +282,7 @@ __global__ void assign_cache_idx(const int *keys, int n, const int *cache_set, // these elements are assigned -1. for (int j = 0; j < items_per_thread; j++) { - int i = threadIdx.x + j * nthreads; + int i = threadIdx.x + j * nthreads; int t_idx = block_offset + i; bool mask = (i < associativity); // whether this slot is available for writing @@ -284,10 +293,10 @@ __global__ void assign_cache_idx(const int *keys, int n, const int *cache_set, if (mask) { int k = find_nth_occurrence(cache_set, n, blockIdx.x, rank[i]); if (k > -1) { - int key_val = keys[k]; + int key_val = keys[k]; cached_keys[t_idx] = key_val; - cache_idx[k] = t_idx; - cache_time[t_idx] = time; + cache_idx[k] = t_idx; + cache_time[t_idx] = time; } } } @@ -315,21 +324,28 @@ namespace { * @param [inout] cached_keys keys stored in the cache, size [n_cache_sets * associativity] * @param [in] n_cache_sets number of cache sets * @param [in] associativity number of keys in cache set - * @param [inout] cache_time time stamp when the indices were cached, size [n_cache_sets * associativity] + * @param [inout] cache_time time stamp when the indices were cached, size [n_cache_sets * + * associativity] * @param [out] cache_idx cache indices of the working set elements, size [n] * @param [out] is_cached whether the element is cached size[n] * @param [in] time iteration counter (used for time stamping) */ -__global__ void get_cache_idx(int *keys, int n, int *cached_keys, - int n_cache_sets, int associativity, - int *cache_time, int *cache_idx, bool *is_cached, - int time) { +__global__ void get_cache_idx(int* keys, + int n, + int* cached_keys, + int n_cache_sets, + int associativity, + int* cache_time, + int* cache_idx, + bool* is_cached, + int time) +{ int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < n) { - int widx = keys[tid]; - int sidx = hash(widx, n_cache_sets); - int cidx = sidx * associativity; - int i = 0; + int widx = keys[tid]; + int sidx = hash(widx, n_cache_sets); + int cidx = sidx * associativity; + int i = 0; bool found = false; // search for empty spot and the least recently used spot while (i < associativity && !found) { @@ -338,9 +354,9 @@ __global__ void get_cache_idx(int *keys, int n, int *cached_keys, } is_cached[tid] = found; if (found) { - cidx = cidx + i - 1; - cache_time[cidx] = time; //update time stamp - cache_idx[tid] = cidx; //exact cache idx + cidx = cidx + i - 1; + cache_time[cidx] = time; // update time stamp + cache_idx[tid] = cidx; // exact cache idx } else { cache_idx[tid] = sidx; // assign cache set } diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh index 8d5b29f700..4767c7f254 100644 --- a/cpp/include/raft/common/cub_wrappers.cuh +++ b/cpp/include/raft/common/cub_wrappers.cuh @@ -22,28 +22,32 @@ namespace raft { /** - * @brief Convenience wrapper over cub's SortPairs method - * @tparam KeyT key type - * @tparam ValueT value type - * @param workspace workspace buffer which will get resized if not enough space - * @param inKeys input keys array - * @param outKeys output keys array - * @param inVals input values array - * @param outVals output values array - * @param len array length - * @param stream cuda stream - */ + * @brief Convenience wrapper over cub's SortPairs method + * @tparam KeyT key type + * @tparam ValueT value type + * @param workspace workspace buffer which will get resized if not enough space + * @param inKeys input keys array + * @param outKeys output keys array + * @param inVals input values array + * @param outVals output values array + * @param len array length + * @param stream cuda stream + */ template -void sortPairs(raft::mr::device::buffer &workspace, const KeyT *inKeys, - KeyT *outKeys, const ValueT *inVals, ValueT *outVals, int len, - cudaStream_t stream) { +void sortPairs(raft::mr::device::buffer& workspace, + const KeyT* inKeys, + KeyT* outKeys, + const ValueT* inVals, + ValueT* outVals, + int len, + cudaStream_t stream) +{ size_t worksize; - cub::DeviceRadixSort::SortPairs(nullptr, worksize, inKeys, outKeys, inVals, - outVals, len, 0, sizeof(KeyT) * 8, stream); + cub::DeviceRadixSort::SortPairs( + nullptr, worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream); workspace.resize(worksize, stream); - cub::DeviceRadixSort::SortPairs(workspace.data(), worksize, inKeys, outKeys, - inVals, outVals, len, 0, sizeof(KeyT) * 8, - stream); + cub::DeviceRadixSort::SortPairs( + workspace.data(), worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream); } } // namespace raft diff --git a/cpp/include/raft/common/device_loads_stores.cuh b/cpp/include/raft/common/device_loads_stores.cuh index bb2b019ecb..41dc9cab08 100644 --- a/cpp/include/raft/common/device_loads_stores.cuh +++ b/cpp/include/raft/common/device_loads_stores.cuh @@ -31,40 +31,43 @@ namespace raft { * @param[out] addr shared memory address (should be aligned to vector size) * @param[in] x data to be stored at this address */ -DI void sts(float* addr, const float& x) { +DI void sts(float* addr, const float& x) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x)); } -DI void sts(float* addr, const float (&x)[1]) { +DI void sts(float* addr, const float (&x)[1]) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x[0])); } -DI void sts(float* addr, const float (&x)[2]) { +DI void sts(float* addr, const float (&x)[2]) +{ auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v2.f32 [%0], {%1, %2};" - : - : "l"(s2), "f"(x[0]), "f"(x[1])); + asm volatile("st.shared.v2.f32 [%0], {%1, %2};" : : "l"(s2), "f"(x[0]), "f"(x[1])); } -DI void sts(float* addr, const float (&x)[4]) { +DI void sts(float* addr, const float (&x)[4]) +{ auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.v4.f32 [%0], {%1, %2, %3, %4};" : : "l"(s4), "f"(x[0]), "f"(x[1]), "f"(x[2]), "f"(x[3])); } -DI void sts(double* addr, const double& x) { +DI void sts(double* addr, const double& x) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x)); } -DI void sts(double* addr, const double (&x)[1]) { +DI void sts(double* addr, const double (&x)[1]) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x[0])); } -DI void sts(double* addr, const double (&x)[2]) { +DI void sts(double* addr, const double (&x)[2]) +{ auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v2.f64 [%0], {%1, %2};" - : - : "l"(s2), "d"(x[0]), "d"(x[1])); + asm volatile("st.shared.v2.f64 [%0], {%1, %2};" : : "l"(s2), "d"(x[0]), "d"(x[1])); } /** @} */ @@ -80,39 +83,42 @@ DI void sts(double* addr, const double (&x)[2]) { * @param[in] addr shared memory address from where to load * (should be aligned to vector size) */ -DI void lds(float& x, float* addr) { +DI void lds(float& x, float* addr) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "l"(s1)); } -DI void lds(float (&x)[1], float* addr) { +DI void lds(float (&x)[1], float* addr) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x[0]) : "l"(s1)); } -DI void lds(float (&x)[2], float* addr) { +DI void lds(float (&x)[2], float* addr) +{ auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" - : "=f"(x[0]), "=f"(x[1]) - : "l"(s2)); + asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(s2)); } -DI void lds(float (&x)[4], float* addr) { +DI void lds(float (&x)[4], float* addr) +{ auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3]) : "l"(s4)); } -DI void lds(double& x, double* addr) { +DI void lds(double& x, double* addr) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x) : "l"(s1)); } -DI void lds(double (&x)[1], double* addr) { +DI void lds(double (&x)[1], double* addr) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x[0]) : "l"(s1)); } -DI void lds(double (&x)[2], double* addr) { +DI void lds(double (&x)[2], double* addr) +{ auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];" - : "=d"(x[0]), "=d"(x[1]) - : "l"(s2)); + asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(s2)); } /** @} */ @@ -123,32 +129,35 @@ DI void lds(double (&x)[2], double* addr) { * @param[out] x data to be loaded from global memory * @param[in] addr address in global memory from where to load */ -DI void ldg(float& x, const float* addr) { +DI void ldg(float& x, const float* addr) +{ asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x) : "l"(addr)); } -DI void ldg(float (&x)[1], const float* addr) { +DI void ldg(float (&x)[1], const float* addr) +{ asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x[0]) : "l"(addr)); } -DI void ldg(float (&x)[2], const float* addr) { - asm volatile("ld.global.cg.v2.f32 {%0, %1}, [%2];" - : "=f"(x[0]), "=f"(x[1]) - : "l"(addr)); +DI void ldg(float (&x)[2], const float* addr) +{ + asm volatile("ld.global.cg.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(addr)); } -DI void ldg(float (&x)[4], const float* addr) { +DI void ldg(float (&x)[4], const float* addr) +{ asm volatile("ld.global.cg.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3]) : "l"(addr)); } -DI void ldg(double& x, const double* addr) { +DI void ldg(double& x, const double* addr) +{ asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x) : "l"(addr)); } -DI void ldg(double (&x)[1], const double* addr) { +DI void ldg(double (&x)[1], const double* addr) +{ asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x[0]) : "l"(addr)); } -DI void ldg(double (&x)[2], const double* addr) { - asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];" - : "=d"(x[0]), "=d"(x[1]) - : "l"(addr)); +DI void ldg(double (&x)[2], const double* addr) +{ + asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(addr)); } /** @} */ diff --git a/cpp/include/raft/common/scatter.cuh b/cpp/include/raft/common/scatter.cuh index 785794461e..b228ac5499 100644 --- a/cpp/include/raft/common/scatter.cuh +++ b/cpp/include/raft/common/scatter.cuh @@ -22,8 +22,8 @@ namespace raft { template -__global__ void scatterKernel(DataT *out, const DataT *in, const IdxT *idx, - IdxT len, Lambda op) { +__global__ void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op) +{ typedef TxN_t DataVec; typedef TxN_t IdxVec; IdxT tid = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x); @@ -34,61 +34,60 @@ __global__ void scatterKernel(DataT *out, const DataT *in, const IdxT *idx, DataVec dataIn; #pragma unroll for (int i = 0; i < VecLen; ++i) { - auto inPos = idxIn.val.data[i]; + auto inPos = idxIn.val.data[i]; dataIn.val.data[i] = op(in[inPos], tid + i); } dataIn.store(out, tid); } template -void scatterImpl(DataT *out, const DataT *in, const IdxT *idx, IdxT len, - Lambda op, cudaStream_t stream) { +void scatterImpl( + DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op, cudaStream_t stream) +{ const IdxT nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxT)TPB); - scatterKernel - <<>>(out, in, idx, len, op); + scatterKernel<<>>(out, in, idx, len, op); CUDA_CHECK(cudaGetLastError()); } /** - * @brief Performs scatter operation based on the input indexing array - * @tparam DataT data type whose array gets scattered - * @tparam IdxT indexing type - * @tparam TPB threads-per-block in the final kernel launched - * @tparam Lambda the device-lambda performing a unary operation on the loaded - * data before it gets scattered - * @param out the output array - * @param in the input array - * @param idx the indexing array - * @param len number of elements in the input array - * @param stream cuda stream where to launch work - * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This - * will be applied to every element before scattering it to the right location. - * The second param in this method will be the destination index. - */ -template , int TPB = 256> -void scatter(DataT *out, const DataT *in, const IdxT *idx, IdxT len, - cudaStream_t stream, Lambda op = raft::Nop()) { + * @brief Performs scatter operation based on the input indexing array + * @tparam DataT data type whose array gets scattered + * @tparam IdxT indexing type + * @tparam TPB threads-per-block in the final kernel launched + * @tparam Lambda the device-lambda performing a unary operation on the loaded + * data before it gets scattered + * @param out the output array + * @param in the input array + * @param idx the indexing array + * @param len number of elements in the input array + * @param stream cuda stream where to launch work + * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This + * will be applied to every element before scattering it to the right location. + * The second param in this method will be the destination index. + */ +template , int TPB = 256> +void scatter(DataT* out, + const DataT* in, + const IdxT* idx, + IdxT len, + cudaStream_t stream, + Lambda op = raft::Nop()) +{ if (len <= 0) return; - constexpr size_t DataSize = sizeof(DataT); - constexpr size_t IdxSize = sizeof(IdxT); + constexpr size_t DataSize = sizeof(DataT); + constexpr size_t IdxSize = sizeof(IdxT); constexpr size_t MaxPerElem = DataSize > IdxSize ? DataSize : IdxSize; - size_t bytes = len * MaxPerElem; + size_t bytes = len * MaxPerElem; if (16 / MaxPerElem && bytes % 16 == 0) { - scatterImpl(out, in, idx, len, - op, stream); + scatterImpl(out, in, idx, len, op, stream); } else if (8 / MaxPerElem && bytes % 8 == 0) { - scatterImpl(out, in, idx, len, op, - stream); + scatterImpl(out, in, idx, len, op, stream); } else if (4 / MaxPerElem && bytes % 4 == 0) { - scatterImpl(out, in, idx, len, op, - stream); + scatterImpl(out, in, idx, len, op, stream); } else if (2 / MaxPerElem && bytes % 2 == 0) { - scatterImpl(out, in, idx, len, op, - stream); + scatterImpl(out, in, idx, len, op, stream); } else if (1 / MaxPerElem) { - scatterImpl(out, in, idx, len, op, - stream); + scatterImpl(out, in, idx, len, op, stream); } else { scatterImpl(out, in, idx, len, op, stream); } diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index dc172c9503..72c3b3897e 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -25,16 +25,7 @@ namespace raft { namespace comms { typedef unsigned int request_t; -enum class datatype_t { - CHAR, - UINT8, - INT32, - UINT32, - INT64, - UINT64, - FLOAT32, - FLOAT64 -}; +enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; enum class op_t { SUM, PROD, MIN, MAX }; /** @@ -50,42 +41,50 @@ template constexpr datatype_t get_type(); template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::CHAR; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::UINT8; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::INT32; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::UINT32; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::INT64; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::UINT64; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::FLOAT32; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::FLOAT64; } @@ -95,72 +94,99 @@ class comms_iface { virtual int get_rank() const = 0; virtual std::unique_ptr comm_split(int color, int key) const = 0; - virtual void barrier() const = 0; + virtual void barrier() const = 0; virtual status_t sync_stream(cudaStream_t stream) const = 0; - virtual void isend(const void* buf, size_t size, int dest, int tag, - request_t* request) const = 0; + virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; - virtual void irecv(void* buf, size_t size, int source, int tag, - request_t* request) const = 0; + virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; virtual void waitall(int count, request_t array_of_requests[]) const = 0; - virtual void allreduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, + virtual void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, cudaStream_t stream) const = 0; - virtual void bcast(void* buff, size_t count, datatype_t datatype, int root, - cudaStream_t stream) const = 0; + virtual void bcast( + void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; - virtual void reduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, int root, + virtual void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, cudaStream_t stream) const = 0; - virtual void allgather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, cudaStream_t stream) const = 0; - - virtual void allgatherv(const void* sendbuf, void* recvbuf, - const size_t* recvcounts, const size_t* displs, - datatype_t datatype, cudaStream_t stream) const = 0; + virtual void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const = 0; - virtual void gather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, int root, + virtual void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const = 0; + + virtual void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, cudaStream_t stream) const = 0; - virtual void gatherv(const void* sendbuf, void* recvbuf, size_t sendcount, - const size_t* recvcounts, const size_t* displs, - datatype_t datatype, int root, + virtual void gatherv(const void* sendbuf, + void* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, cudaStream_t stream) const = 0; - virtual void reducescatter(const void* sendbuff, void* recvbuff, - size_t recvcount, datatype_t datatype, op_t op, + virtual void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, cudaStream_t stream) const = 0; // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_send(const void* buf, size_t size, int dest, - cudaStream_t stream) const = 0; + virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_recv(void* buf, size_t size, int source, - cudaStream_t stream) const = 0; - - virtual void device_sendrecv(const void* sendbuf, size_t sendsize, int dest, - void* recvbuf, size_t recvsize, int source, + virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; + + virtual void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, cudaStream_t stream) const = 0; - virtual void device_multicast_sendrecv( - const void* sendbuf, std::vector const& sendsizes, - std::vector const& sendoffsets, std::vector const& dests, - void* recvbuf, std::vector const& recvsizes, - std::vector const& recvoffsets, std::vector const& sources, - cudaStream_t stream) const = 0; + virtual void device_multicast_sendrecv(const void* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + void* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const = 0; }; class comms_t { public: - comms_t(std::unique_ptr impl) : impl_(impl.release()) { + comms_t(std::unique_ptr impl) : impl_(impl.release()) + { ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); } @@ -187,7 +213,8 @@ class comms_t { * @param color ranks w/ the same color are placed in the same communicator * @param key controls rank assignment */ - std::unique_ptr comm_split(int color, int key) const { + std::unique_ptr comm_split(int color, int key) const + { return impl_->comm_split(color, key); } @@ -204,9 +231,7 @@ class comms_t { * * @param stream the cuda stream to sync collective operations on */ - status_t sync_stream(cudaStream_t stream) const { - return impl_->sync_stream(stream); - } + status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } /** * Performs an asynchronous point-to-point send @@ -219,10 +244,9 @@ class comms_t { * This will be used in `waitall()` to synchronize until the message is delivered (or fails). */ template - void isend(const value_t* buf, size_t size, int dest, int tag, - request_t* request) const { - impl_->isend(static_cast(buf), size * sizeof(value_t), dest, - tag, request); + void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const + { + impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); } /** @@ -236,10 +260,9 @@ class comms_t { * This will be used in `waitall()` to synchronize until the message is delivered (or fails). */ template - void irecv(value_t* buf, size_t size, int source, int tag, - request_t* request) const { - impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, - request); + void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const + { + impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); } /** @@ -247,7 +270,8 @@ class comms_t { * @param count number of requests to synchronize on * @param array_of_requests an array of request_t objects returned from isend/irecv */ - void waitall(int count, request_t array_of_requests[]) const { + void waitall(int count, request_t array_of_requests[]) const + { impl_->waitall(count, array_of_requests); } @@ -261,11 +285,15 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void allreduce(const value_t* sendbuff, value_t* recvbuff, size_t count, - op_t op, cudaStream_t stream) const { + void allreduce( + const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const + { impl_->allreduce(static_cast(sendbuff), - static_cast(recvbuff), count, get_type(), - op, stream); + static_cast(recvbuff), + count, + get_type(), + op, + stream); } /** @@ -277,9 +305,9 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const { - impl_->bcast(static_cast(buff), count, get_type(), root, - stream); + void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const + { + impl_->bcast(static_cast(buff), count, get_type(), root, stream); } /** @@ -293,11 +321,20 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void reduce(const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, - int root, cudaStream_t stream) const { + void reduce(const value_t* sendbuff, + value_t* recvbuff, + size_t count, + op_t op, + int root, + cudaStream_t stream) const + { impl_->reduce(static_cast(sendbuff), - static_cast(recvbuff), count, get_type(), op, - root, stream); + static_cast(recvbuff), + count, + get_type(), + op, + root, + stream); } /** @@ -309,11 +346,16 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void allgather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount, - cudaStream_t stream) const { + void allgather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + cudaStream_t stream) const + { impl_->allgather(static_cast(sendbuff), - static_cast(recvbuff), sendcount, - get_type(), stream); + static_cast(recvbuff), + sendcount, + get_type(), + stream); } /** @@ -328,12 +370,18 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void allgatherv(const value_t* sendbuf, value_t* recvbuf, - const size_t* recvcounts, const size_t* displs, - cudaStream_t stream) const { + void allgatherv(const value_t* sendbuf, + value_t* recvbuf, + const size_t* recvcounts, + const size_t* displs, + cudaStream_t stream) const + { impl_->allgatherv(static_cast(sendbuf), - static_cast(recvbuf), recvcounts, displs, - get_type(), stream); + static_cast(recvbuf), + recvcounts, + displs, + get_type(), + stream); } /** @@ -346,11 +394,18 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void gather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount, - int root, cudaStream_t stream) const { + void gather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + int root, + cudaStream_t stream) const + { impl_->gather(static_cast(sendbuff), - static_cast(recvbuff), sendcount, get_type(), - root, stream); + static_cast(recvbuff), + sendcount, + get_type(), + root, + stream); } /** @@ -367,12 +422,22 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void gatherv(const value_t* sendbuf, value_t* recvbuf, size_t sendcount, - const size_t* recvcounts, const size_t* displs, int root, - cudaStream_t stream) const { + void gatherv(const value_t* sendbuf, + value_t* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + int root, + cudaStream_t stream) const + { impl_->gatherv(static_cast(sendbuf), - static_cast(recvbuf), sendcount, recvcounts, displs, - get_type(), root, stream); + static_cast(recvbuf), + sendcount, + recvcounts, + displs, + get_type(), + root, + stream); } /** @@ -384,11 +449,18 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void reducescatter(const value_t* sendbuff, value_t* recvbuff, - size_t recvcount, op_t op, cudaStream_t stream) const { + void reducescatter(const value_t* sendbuff, + value_t* recvbuff, + size_t recvcount, + op_t op, + cudaStream_t stream) const + { impl_->reducescatter(static_cast(sendbuff), - static_cast(recvbuff), recvcount, - get_type(), op, stream); + static_cast(recvbuff), + recvcount, + get_type(), + op, + stream); } /** @@ -403,10 +475,9 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_send(const value_t* buf, size_t size, int dest, - cudaStream_t stream) const { - impl_->device_send(static_cast(buf), size * sizeof(value_t), - dest, stream); + void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const + { + impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); } /** @@ -421,10 +492,9 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_recv(value_t* buf, size_t size, int source, - cudaStream_t stream) const { - impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, - stream); + void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const + { + impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); } /** @@ -440,12 +510,21 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_sendrecv(const value_t* sendbuf, size_t sendsize, int dest, - value_t* recvbuf, size_t recvsize, int source, - cudaStream_t stream) const { - impl_->device_sendrecv( - static_cast(sendbuf), sendsize * sizeof(value_t), dest, - static_cast(recvbuf), recvsize * sizeof(value_t), source, stream); + void device_sendrecv(const value_t* sendbuf, + size_t sendsize, + int dest, + value_t* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { + impl_->device_sendrecv(static_cast(sendbuf), + sendsize * sizeof(value_t), + dest, + static_cast(recvbuf), + recvsize * sizeof(value_t), + source, + stream); } /** @@ -463,28 +542,37 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_multicast_sendrecv( - const value_t* sendbuf, std::vector const& sendsizes, - std::vector const& sendoffsets, std::vector const& dests, - value_t* recvbuf, std::vector const& recvsizes, - std::vector const& recvoffsets, std::vector const& sources, - cudaStream_t stream) const { - auto sendbytesizes = sendsizes; + void device_multicast_sendrecv(const value_t* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + value_t* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const + { + auto sendbytesizes = sendsizes; auto sendbyteoffsets = sendoffsets; for (size_t i = 0; i < sendsizes.size(); ++i) { sendbytesizes[i] *= sizeof(value_t); sendbyteoffsets[i] *= sizeof(value_t); } - auto recvbytesizes = recvsizes; + auto recvbytesizes = recvsizes; auto recvbyteoffsets = recvoffsets; for (size_t i = 0; i < recvsizes.size(); ++i) { recvbytesizes[i] *= sizeof(value_t); recvbyteoffsets[i] *= sizeof(value_t); } impl_->device_multicast_sendrecv(static_cast(sendbuf), - sendbytesizes, sendbyteoffsets, dests, - static_cast(recvbuf), recvbytesizes, - recvbyteoffsets, sources, stream); + sendbytesizes, + sendbyteoffsets, + dests, + static_cast(recvbuf), + recvbytesizes, + recvbyteoffsets, + sources, + stream); } private: diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp index 7b24e31bbe..93e31b4d6a 100644 --- a/cpp/include/raft/comms/helper.hpp +++ b/cpp/include/raft/comms/helper.hpp @@ -36,9 +36,9 @@ namespace comms { * @param num_ranks number of ranks in communicator clique * @param rank rank of local instance */ -void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, - int num_ranks, int rank) { - auto d_alloc = handle->get_device_allocator(); +void build_comms_nccl_only(handle_t* handle, ncclComm_t nccl_comm, int num_ranks, int rank) +{ + auto d_alloc = handle->get_device_allocator(); cudaStream_t stream = handle->get_stream(); auto communicator = std::make_shared(std::unique_ptr( @@ -61,40 +61,41 @@ void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, * @param num_ranks number of ranks in communicator clique * @param rank rank of local instance */ -void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, - void *ucp_worker, void *eps, int num_ranks, - int rank) { - auto eps_sp = std::make_shared(new ucp_ep_h[num_ranks]); +void build_comms_nccl_ucx( + handle_t* handle, ncclComm_t nccl_comm, void* ucp_worker, void* eps, int num_ranks, int rank) +{ + auto eps_sp = std::make_shared(new ucp_ep_h[num_ranks]); - auto size_t_ep_arr = reinterpret_cast(eps); + auto size_t_ep_arr = reinterpret_cast(eps); for (int i = 0; i < num_ranks; i++) { - size_t ptr = size_t_ep_arr[i]; - auto ucp_ep_v = reinterpret_cast(*eps_sp); + size_t ptr = size_t_ep_arr[i]; + auto ucp_ep_v = reinterpret_cast(*eps_sp); if (ptr != 0) { auto eps_ptr = reinterpret_cast(size_t_ep_arr[i]); - ucp_ep_v[i] = eps_ptr; + ucp_ep_v[i] = eps_ptr; } else { ucp_ep_v[i] = nullptr; } } - auto d_alloc = handle->get_device_allocator(); + auto d_alloc = handle->get_device_allocator(); cudaStream_t stream = handle->get_stream(); - auto communicator = std::make_shared(std::unique_ptr( - new raft::comms::std_comms(nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, - num_ranks, rank, d_alloc, stream))); + auto communicator = + std::make_shared(std::unique_ptr(new raft::comms::std_comms( + nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, d_alloc, stream))); handle->set_comms(communicator); } -inline void nccl_unique_id_from_char(ncclUniqueId *id, char *uniqueId, - int size) { +inline void nccl_unique_id_from_char(ncclUniqueId* id, char* uniqueId, int size) +{ memcpy(id->internal, uniqueId, size); } -inline void get_unique_id(char *uid, int size) { +inline void get_unique_id(char* uid, int size) +{ ncclUniqueId id; ncclGetUniqueId(&id); diff --git a/cpp/include/raft/comms/mpi_comms.hpp b/cpp/include/raft/comms/mpi_comms.hpp index 8dda74f0a9..65f38b2625 100644 --- a/cpp/include/raft/comms/mpi_comms.hpp +++ b/cpp/include/raft/comms/mpi_comms.hpp @@ -32,16 +32,16 @@ #include #include -#define MPI_TRY(call) \ - do { \ - int status = call; \ - if (MPI_SUCCESS != status) { \ - int mpi_error_string_lenght = 0; \ - char mpi_error_string[MPI_MAX_ERROR_STRING]; \ - MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \ - RAFT_EXPECTS(MPI_SUCCESS == status, "ERROR: MPI call='%s'. Reason:%s\n", \ - #call, mpi_error_string); \ - } \ +#define MPI_TRY(call) \ + do { \ + int status = call; \ + if (MPI_SUCCESS != status) { \ + int mpi_error_string_lenght = 0; \ + char mpi_error_string[MPI_MAX_ERROR_STRING]; \ + MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \ + RAFT_EXPECTS( \ + MPI_SUCCESS == status, "ERROR: MPI call='%s'. Reason:%s\n", #call, mpi_error_string); \ + } \ } while (0) #define MPI_TRY_NO_THROW(call) \ @@ -51,48 +51,41 @@ int mpi_error_string_lenght = 0; \ char mpi_error_string[MPI_MAX_ERROR_STRING]; \ MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \ - printf("MPI call='%s' at file=%s line=%d failed with %s ", #call, \ - __FILE__, __LINE__, mpi_error_string); \ + printf("MPI call='%s' at file=%s line=%d failed with %s ", \ + #call, \ + __FILE__, \ + __LINE__, \ + mpi_error_string); \ } \ } while (0) namespace raft { namespace comms { -constexpr MPI_Datatype get_mpi_datatype(const datatype_t datatype) { +constexpr MPI_Datatype get_mpi_datatype(const datatype_t datatype) +{ switch (datatype) { - case datatype_t::CHAR: - return MPI_CHAR; - case datatype_t::UINT8: - return MPI_UNSIGNED_CHAR; - case datatype_t::INT32: - return MPI_INT; - case datatype_t::UINT32: - return MPI_UNSIGNED; - case datatype_t::INT64: - return MPI_LONG_LONG; - case datatype_t::UINT64: - return MPI_UNSIGNED_LONG_LONG; - case datatype_t::FLOAT32: - return MPI_FLOAT; - case datatype_t::FLOAT64: - return MPI_DOUBLE; + case datatype_t::CHAR: return MPI_CHAR; + case datatype_t::UINT8: return MPI_UNSIGNED_CHAR; + case datatype_t::INT32: return MPI_INT; + case datatype_t::UINT32: return MPI_UNSIGNED; + case datatype_t::INT64: return MPI_LONG_LONG; + case datatype_t::UINT64: return MPI_UNSIGNED_LONG_LONG; + case datatype_t::FLOAT32: return MPI_FLOAT; + case datatype_t::FLOAT64: return MPI_DOUBLE; default: // Execution should never reach here. This takes care of compiler warning. return MPI_DOUBLE; } } -constexpr MPI_Op get_mpi_op(const op_t op) { +constexpr MPI_Op get_mpi_op(const op_t op) +{ switch (op) { - case op_t::SUM: - return MPI_SUM; - case op_t::PROD: - return MPI_PROD; - case op_t::MIN: - return MPI_MIN; - case op_t::MAX: - return MPI_MAX; + case op_t::SUM: return MPI_SUM; + case op_t::PROD: return MPI_PROD; + case op_t::MIN: return MPI_MIN; + case op_t::MAX: return MPI_MAX; default: // Execution should never reach here. This takes care of compiler warning. return MPI_MAX; @@ -102,38 +95,35 @@ constexpr MPI_Op get_mpi_op(const op_t op) { class mpi_comms : public comms_iface { public: mpi_comms(MPI_Comm comm, const bool owns_mpi_comm) - : owns_mpi_comm_(owns_mpi_comm), - mpi_comm_(comm), - size_(0), - rank_(1), - next_request_id_(0) { + : owns_mpi_comm_(owns_mpi_comm), mpi_comm_(comm), size_(0), rank_(1), next_request_id_(0) + { int mpi_is_initialized = 0; MPI_TRY(MPI_Initialized(&mpi_is_initialized)); RAFT_EXPECTS(mpi_is_initialized, "ERROR: MPI is not initialized!"); MPI_TRY(MPI_Comm_size(mpi_comm_, &size_)); MPI_TRY(MPI_Comm_rank(mpi_comm_, &rank_)); - //get NCCL unique ID at rank 0 and broadcast it to all others + // get NCCL unique ID at rank 0 and broadcast it to all others ncclUniqueId id; if (0 == rank_) NCCL_TRY(ncclGetUniqueId(&id)); MPI_TRY(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, mpi_comm_)); - //initializing NCCL + // initializing NCCL NCCL_TRY(ncclCommInitRank(&nccl_comm_, size_, id, rank_)); } - virtual ~mpi_comms() { - //finalizing NCCL + virtual ~mpi_comms() + { + // finalizing NCCL NCCL_TRY_NO_THROW(ncclCommDestroy(nccl_comm_)); - if (owns_mpi_comm_) { - MPI_TRY_NO_THROW(MPI_Comm_free(&mpi_comm_)); - } + if (owns_mpi_comm_) { MPI_TRY_NO_THROW(MPI_Comm_free(&mpi_comm_)); } } int get_size() const { return size_; } int get_rank() const { return rank_; } - std::unique_ptr comm_split(int color, int key) const { + std::unique_ptr comm_split(int color, int key) const + { MPI_Comm new_comm; MPI_TRY(MPI_Comm_split(mpi_comm_, color, key, &new_comm)); return std::unique_ptr(new mpi_comms(new_comm, true)); @@ -141,15 +131,15 @@ class mpi_comms : public comms_iface { void barrier() const { MPI_TRY(MPI_Barrier(mpi_comm_)); } - void isend(const void* buf, size_t size, int dest, int tag, - request_t* request) const { + void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const + { MPI_Request mpi_req; request_t req_id; if (free_requests_.empty()) { req_id = next_request_id_++; } else { auto it = free_requests_.begin(); - req_id = *it; + req_id = *it; free_requests_.erase(it); } MPI_TRY(MPI_Isend(buf, size, MPI_BYTE, dest, tag, mpi_comm_, &mpi_req)); @@ -157,15 +147,15 @@ class mpi_comms : public comms_iface { *request = req_id; } - void irecv(void* buf, size_t size, int source, int tag, - request_t* request) const { + void irecv(void* buf, size_t size, int source, int tag, request_t* request) const + { MPI_Request mpi_req; request_t req_id; if (free_requests_.empty()) { req_id = next_request_id_++; } else { auto it = free_requests_.begin(); - req_id = *it; + req_id = *it; free_requests_.erase(it); } @@ -174,7 +164,8 @@ class mpi_comms : public comms_iface { *request = req_id; } - void waitall(int count, request_t array_of_requests[]) const { + void waitall(int count, request_t array_of_requests[]) const + { std::vector requests; requests.reserve(count); for (int i = 0; i < count; ++i) { @@ -189,87 +180,138 @@ class mpi_comms : public comms_iface { MPI_TRY(MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE)); } - void allreduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count, - get_nccl_datatype(datatype), get_nccl_op(op), - nccl_comm_, stream)); + void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + cudaStream_t stream) const + { + NCCL_TRY(ncclAllReduce( + sendbuff, recvbuff, count, get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream)); } - void bcast(void* buff, size_t count, datatype_t datatype, int root, - cudaStream_t stream) const { - NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const + { + NCCL_TRY( + ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); } - void reduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, int root, - cudaStream_t stream) const { - NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype), - get_nccl_op(op), root, nccl_comm_, stream)); + void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, + cudaStream_t stream) const + { + NCCL_TRY(ncclReduce(sendbuff, + recvbuff, + count, + get_nccl_datatype(datatype), + get_nccl_op(op), + root, + nccl_comm_, + stream)); } - void allgather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, cudaStream_t stream) const { - NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount, - get_nccl_datatype(datatype), nccl_comm_, stream)); + void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const + { + NCCL_TRY(ncclAllGather( + sendbuff, recvbuff, sendcount, get_nccl_datatype(datatype), nccl_comm_, stream)); } - void allgatherv(const void* sendbuf, void* recvbuf, const size_t* recvcounts, - const size_t* displs, datatype_t datatype, - cudaStream_t stream) const { - //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf - //Listing 1 on page 4. + void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const + { + // From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - + // https://arxiv.org/pdf/1812.05964.pdf Listing 1 on page 4. for (int root = 0; root < size_; ++root) { - NCCL_TRY(ncclBroadcast(sendbuf, - static_cast(recvbuf) + - displs[root] * get_datatype_size(datatype), - recvcounts[root], get_nccl_datatype(datatype), - root, nccl_comm_, stream)); + NCCL_TRY( + ncclBroadcast(sendbuf, + static_cast(recvbuf) + displs[root] * get_datatype_size(datatype), + recvcounts[root], + get_nccl_datatype(datatype), + root, + nccl_comm_, + stream)); } } - void gather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, int root, cudaStream_t stream) const { + void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, + cudaStream_t stream) const + { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv( - static_cast(recvbuff) + sendcount * r * dtype_size, sendcount, - get_nccl_datatype(datatype), r, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(static_cast(recvbuff) + sendcount * r * dtype_size, + sendcount, + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void gatherv(const void* sendbuff, void* recvbuff, size_t sendcount, - const size_t* recvcounts, const size_t* displs, - datatype_t datatype, int root, cudaStream_t stream) const { + void gatherv(const void* sendbuff, + void* recvbuff, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, + cudaStream_t stream) const + { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { NCCL_TRY(ncclRecv(static_cast(recvbuff) + displs[r] * dtype_size, - recvcounts[r], get_nccl_datatype(datatype), r, - nccl_comm_, stream)); + recvcounts[r], + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void reducescatter(const void* sendbuff, void* recvbuff, size_t recvcount, - datatype_t datatype, op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount, - get_nccl_datatype(datatype), get_nccl_op(op), - nccl_comm_, stream)); + void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, + cudaStream_t stream) const + { + NCCL_TRY(ncclReduceScatter(sendbuff, + recvbuff, + recvcount, + get_nccl_datatype(datatype), + get_nccl_op(op), + nccl_comm_, + stream)); } - status_t sync_stream(cudaStream_t stream) const { + status_t sync_stream(cudaStream_t stream) const + { cudaError_t cudaErr; ncclResult_t ncclErr, ncclAsyncErr; while (1) { @@ -302,45 +344,58 @@ class mpi_comms : public comms_iface { }; // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_send(const void* buf, size_t size, int dest, - cudaStream_t stream) const { + void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const + { NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream)); } // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_recv(void* buf, size_t size, int source, - cudaStream_t stream) const { + void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const + { NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream)); } - void device_sendrecv(const void* sendbuf, size_t sendsize, int dest, - void* recvbuf, size_t recvsize, int source, - cudaStream_t stream) const { + void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream)); - NCCL_TRY( - ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } void device_multicast_sendrecv(const void* sendbuf, std::vector const& sendsizes, std::vector const& sendoffsets, - std::vector const& dests, void* recvbuf, + std::vector const& dests, + void* recvbuf, std::vector const& recvsizes, std::vector const& recvoffsets, std::vector const& sources, - cudaStream_t stream) const { + cudaStream_t stream) const + { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); for (size_t i = 0; i < sendsizes.size(); ++i) { NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], - sendsizes[i], ncclUint8, dests[i], nccl_comm_, stream)); + sendsizes[i], + ncclUint8, + dests[i], + nccl_comm_, + stream)); } for (size_t i = 0; i < recvsizes.size(); ++i) { NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], - recvsizes[i], ncclUint8, sources[i], nccl_comm_, + recvsizes[i], + ncclUint8, + sources[i], + nccl_comm_, stream)); } NCCL_TRY(ncclGroupEnd()); @@ -358,9 +413,10 @@ class mpi_comms : public comms_iface { mutable std::unordered_set free_requests_; }; -inline void initialize_mpi_comms(handle_t* handle, MPI_Comm comm) { - auto communicator = std::make_shared( - std::unique_ptr(new mpi_comms(comm, true))); +inline void initialize_mpi_comms(handle_t* handle, MPI_Comm comm) +{ + auto communicator = + std::make_shared(std::unique_ptr(new mpi_comms(comm, true))); handle->set_comms(communicator); }; diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 765e8741bb..5f80328d3f 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -62,10 +62,14 @@ class std_comms : public comms_iface { * @param size size of the cluster * @param rank rank of the current worker */ - std_comms(ncclComm_t nccl_comm, ucp_worker_h ucp_worker, - std::shared_ptr eps, int num_ranks, int rank, + std_comms(ncclComm_t nccl_comm, + ucp_worker_h ucp_worker, + std::shared_ptr eps, + int num_ranks, + int rank, const std::shared_ptr device_allocator, - cudaStream_t stream, bool subcomms_ucp = true) + cudaStream_t stream, + bool subcomms_ucp = true) : nccl_comm_(nccl_comm), stream_(stream), num_ranks_(num_ranks), @@ -74,7 +78,8 @@ class std_comms : public comms_iface { ucp_worker_(ucp_worker), ucp_eps_(eps), next_request_id_(0), - device_allocator_(device_allocator) { + device_allocator_(device_allocator) + { initialize(); }; @@ -84,7 +89,9 @@ class std_comms : public comms_iface { * @param size size of the cluster * @param rank rank of the current worker */ - std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank, + std_comms(const ncclComm_t nccl_comm, + int num_ranks, + int rank, const std::shared_ptr device_allocator, cudaStream_t stream) : nccl_comm_(nccl_comm), @@ -92,37 +99,37 @@ class std_comms : public comms_iface { num_ranks_(num_ranks), rank_(rank), subcomms_ucp_(false), - device_allocator_(device_allocator) { + device_allocator_(device_allocator) + { initialize(); }; - virtual ~std_comms() { + virtual ~std_comms() + { device_allocator_->deallocate(sendbuff_, sizeof(int), stream_); device_allocator_->deallocate(recvbuff_, sizeof(int), stream_); } - void initialize() { - sendbuff_ = reinterpret_cast( - device_allocator_->allocate(sizeof(int), stream_)); - recvbuff_ = reinterpret_cast( - device_allocator_->allocate(sizeof(int), stream_)); + void initialize() + { + sendbuff_ = reinterpret_cast(device_allocator_->allocate(sizeof(int), stream_)); + recvbuff_ = reinterpret_cast(device_allocator_->allocate(sizeof(int), stream_)); } int get_size() const { return num_ranks_; } int get_rank() const { return rank_; } - std::unique_ptr comm_split(int color, int key) const { + std::unique_ptr comm_split(int color, int key) const + { mr::device::buffer d_colors(device_allocator_, stream_, get_size()); mr::device::buffer d_keys(device_allocator_, stream_, get_size()); update_device(d_colors.data() + get_rank(), &color, 1, stream_); update_device(d_keys.data() + get_rank(), &key, 1, stream_); - allgather(d_colors.data() + get_rank(), d_colors.data(), 1, - datatype_t::INT32, stream_); - allgather(d_keys.data() + get_rank(), d_keys.data(), 1, datatype_t::INT32, - stream_); + allgather(d_colors.data() + get_rank(), d_colors.data(), 1, datatype_t::INT32, stream_); + allgather(d_keys.data() + get_rank(), d_keys.data(), 1, datatype_t::INT32, stream_); this->sync_stream(stream_); std::vector h_colors(get_size()); @@ -139,9 +146,7 @@ class std_comms : public comms_iface { for (int i = 0; i < get_size(); ++i) { if (h_colors[i] == color) { subcomm_ranks.push_back(i); - if (ucp_worker_ != nullptr && subcomms_ucp_) { - new_ucx_ptrs.push_back((*ucp_eps_)[i]); - } + if (ucp_worker_ != nullptr && subcomms_ucp_) { new_ucx_ptrs.push_back((*ucp_eps_)[i]); } } } @@ -150,8 +155,7 @@ class std_comms : public comms_iface { NCCL_TRY(ncclGetUniqueId(&id)); std::vector requests(subcomm_ranks.size() - 1); for (size_t i = 1; i < subcomm_ranks.size(); ++i) { - isend(&id, sizeof(ncclUniqueId), subcomm_ranks[i], color, - requests.data() + (i - 1)); + isend(&id, sizeof(ncclUniqueId), subcomm_ranks[i], color, requests.data() + (i - 1)); } waitall(requests.size(), requests.data()); } else { @@ -166,17 +170,23 @@ class std_comms : public comms_iface { NCCL_TRY(ncclCommInitRank(&nccl_comm, subcomm_ranks.size(), id, key)); if (ucp_worker_ != nullptr && subcomms_ucp_) { - auto eps_sp = std::make_shared(new_ucx_ptrs.data()); - return std::unique_ptr(new std_comms( - nccl_comm, (ucp_worker_h)ucp_worker_, eps_sp, subcomm_ranks.size(), key, - device_allocator_, stream_, subcomms_ucp_)); + auto eps_sp = std::make_shared(new_ucx_ptrs.data()); + return std::unique_ptr(new std_comms(nccl_comm, + (ucp_worker_h)ucp_worker_, + eps_sp, + subcomm_ranks.size(), + key, + device_allocator_, + stream_, + subcomms_ucp_)); } else { - return std::unique_ptr(new std_comms( - nccl_comm, subcomm_ranks.size(), key, device_allocator_, stream_)); + return std::unique_ptr( + new std_comms(nccl_comm, subcomm_ranks.size(), key, device_allocator_, stream_)); } } - void barrier() const { + void barrier() const + { CUDA_CHECK(cudaMemsetAsync(sendbuff_, 1, sizeof(int), stream_)); CUDA_CHECK(cudaMemsetAsync(recvbuff_, 1, sizeof(int), stream_)); @@ -186,39 +196,37 @@ class std_comms : public comms_iface { "ERROR: syncStream failed. This can be caused by a failed rank_."); } - void get_request_id(request_t *req) const { + void get_request_id(request_t* req) const + { request_t req_id; if (this->free_requests_.empty()) req_id = this->next_request_id_++; else { auto it = this->free_requests_.begin(); - req_id = *it; + req_id = *it; this->free_requests_.erase(it); } *req = req_id; } - void isend(const void *buf, size_t size, int dest, int tag, - request_t *request) const { - ASSERT(ucp_worker_ != nullptr, - "ERROR: UCX comms not initialized on communicator."); + void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const + { + ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); get_request_id(request); ucp_ep_h ep_ptr = (*ucp_eps_)[dest]; - ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); + ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request)); - this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, - default_tag_mask, get_rank()); + this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, default_tag_mask, get_rank()); requests_in_flight_.insert(std::make_pair(*request, ucp_req)); } - void irecv(void *buf, size_t size, int source, int tag, - request_t *request) const { - ASSERT(ucp_worker_ != nullptr, - "ERROR: UCX comms not initialized on communicator."); + void irecv(void* buf, size_t size, int source, int tag, request_t* request) const + { + ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); get_request_id(request); @@ -226,18 +234,17 @@ class std_comms : public comms_iface { ucp_tag_t tag_mask = default_tag_mask; - ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); - ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag, - tag_mask, source); + ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request)); + ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag, tag_mask, source); requests_in_flight_.insert(std::make_pair(*request, ucp_req)); } - void waitall(int count, request_t array_of_requests[]) const { - ASSERT(ucp_worker_ != nullptr, - "ERROR: UCX comms not initialized on communicator."); + void waitall(int count, request_t array_of_requests[]) const + { + ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); - std::vector requests; + std::vector requests; requests.reserve(count); time_t start = time(NULL); @@ -245,7 +252,8 @@ class std_comms : public comms_iface { for (int i = 0; i < count; ++i) { auto req_it = requests_in_flight_.find(array_of_requests[i]); ASSERT(requests_in_flight_.end() != req_it, - "ERROR: waitall on invalid request: %d", array_of_requests[i]); + "ERROR: waitall on invalid request: %d", + array_of_requests[i]); requests.push_back(req_it->second); free_requests_.insert(req_it->first); requests_in_flight_.erase(req_it); @@ -258,8 +266,7 @@ class std_comms : public comms_iface { // in 10 or more seconds. ASSERT(now - start < 10, "Timed out waiting for requests."); - for (std::vector::iterator it = requests.begin(); - it != requests.end();) { + for (std::vector::iterator it = requests.begin(); it != requests.end();) { bool restart = false; // resets the timeout when any progress was made // Causes UCP to progress through the send/recv message queue @@ -272,10 +279,8 @@ class std_comms : public comms_iface { // If the message needs release, we know it will be sent/received // asynchronously, so we will need to track and verify its state if (req->needs_release) { - ASSERT(UCS_PTR_IS_PTR(req->req), - "UCX Request Error. Request is not valid UCX pointer"); - ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", - UCS_PTR_STATUS(req->req)); + ASSERT(UCS_PTR_IS_PTR(req->req), "UCX Request Error. Request is not valid UCX pointer"); + ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", UCS_PTR_STATUS(req->req)); ASSERT(req->req->completed == 1 || req->req->completed == 0, "request->completed not a valid value: %d\n", req->req->completed); @@ -296,94 +301,143 @@ class std_comms : public comms_iface { ++it; } // if any progress was made, reset the timeout start time - if (restart) { - start = time(NULL); - } + if (restart) { start = time(NULL); } } } } - void allreduce(const void *sendbuff, void *recvbuff, size_t count, - datatype_t datatype, op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count, - get_nccl_datatype(datatype), get_nccl_op(op), - nccl_comm_, stream)); + void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + cudaStream_t stream) const + { + NCCL_TRY(ncclAllReduce( + sendbuff, recvbuff, count, get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream)); } - void bcast(void *buff, size_t count, datatype_t datatype, int root, - cudaStream_t stream) const { - NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const + { + NCCL_TRY( + ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); } - void reduce(const void *sendbuff, void *recvbuff, size_t count, - datatype_t datatype, op_t op, int root, - cudaStream_t stream) const { - NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype), - get_nccl_op(op), root, nccl_comm_, stream)); + void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, + cudaStream_t stream) const + { + NCCL_TRY(ncclReduce(sendbuff, + recvbuff, + count, + get_nccl_datatype(datatype), + get_nccl_op(op), + root, + nccl_comm_, + stream)); } - void allgather(const void *sendbuff, void *recvbuff, size_t sendcount, - datatype_t datatype, cudaStream_t stream) const { - NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount, - get_nccl_datatype(datatype), nccl_comm_, stream)); + void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const + { + NCCL_TRY(ncclAllGather( + sendbuff, recvbuff, sendcount, get_nccl_datatype(datatype), nccl_comm_, stream)); } - void allgatherv(const void *sendbuf, void *recvbuf, const size_t *recvcounts, - const size_t *displs, datatype_t datatype, - cudaStream_t stream) const { - //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf - //Listing 1 on page 4. + void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const + { + // From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - + // https://arxiv.org/pdf/1812.05964.pdf Listing 1 on page 4. for (int root = 0; root < num_ranks_; ++root) { size_t dtype_size = get_datatype_size(datatype); - NCCL_TRY(ncclBroadcast( - sendbuf, static_cast(recvbuf) + displs[root] * dtype_size, - recvcounts[root], get_nccl_datatype(datatype), root, nccl_comm_, - stream)); + NCCL_TRY(ncclBroadcast(sendbuf, + static_cast(recvbuf) + displs[root] * dtype_size, + recvcounts[root], + get_nccl_datatype(datatype), + root, + nccl_comm_, + stream)); } } - void gather(const void *sendbuff, void *recvbuff, size_t sendcount, - datatype_t datatype, int root, cudaStream_t stream) const { + void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, + cudaStream_t stream) const + { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv( - static_cast(recvbuff) + sendcount * r * dtype_size, sendcount, - get_nccl_datatype(datatype), r, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(static_cast(recvbuff) + sendcount * r * dtype_size, + sendcount, + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void gatherv(const void *sendbuff, void *recvbuff, size_t sendcount, - const size_t *recvcounts, const size_t *displs, - datatype_t datatype, int root, cudaStream_t stream) const { + void gatherv(const void* sendbuff, + void* recvbuff, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, + cudaStream_t stream) const + { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv( - static_cast(recvbuff) + displs[r] * dtype_size, recvcounts[r], - get_nccl_datatype(datatype), r, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(static_cast(recvbuff) + displs[r] * dtype_size, + recvcounts[r], + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void reducescatter(const void *sendbuff, void *recvbuff, size_t recvcount, - datatype_t datatype, op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount, - get_nccl_datatype(datatype), get_nccl_op(op), - nccl_comm_, stream)); + void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, + cudaStream_t stream) const + { + NCCL_TRY(ncclReduceScatter(sendbuff, + recvbuff, + recvcount, + get_nccl_datatype(datatype), + get_nccl_op(op), + nccl_comm_, + stream)); } - status_t sync_stream(cudaStream_t stream) const { + status_t sync_stream(cudaStream_t stream) const + { cudaError_t cudaErr; ncclResult_t ncclErr, ncclAsyncErr; while (1) { @@ -416,45 +470,58 @@ class std_comms : public comms_iface { } // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_send(const void *buf, size_t size, int dest, - cudaStream_t stream) const { + void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const + { NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream)); } // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_recv(void *buf, size_t size, int source, - cudaStream_t stream) const { + void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const + { NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream)); } - void device_sendrecv(const void *sendbuf, size_t sendsize, int dest, - void *recvbuf, size_t recvsize, int source, - cudaStream_t stream) const { + void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream)); - NCCL_TRY( - ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void device_multicast_sendrecv(const void *sendbuf, - std::vector const &sendsizes, - std::vector const &sendoffsets, - std::vector const &dests, void *recvbuf, - std::vector const &recvsizes, - std::vector const &recvoffsets, - std::vector const &sources, - cudaStream_t stream) const { + void device_multicast_sendrecv(const void* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + void* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const + { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); for (size_t i = 0; i < sendsizes.size(); ++i) { - NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], - sendsizes[i], ncclUint8, dests[i], nccl_comm_, stream)); + NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], + sendsizes[i], + ncclUint8, + dests[i], + nccl_comm_, + stream)); } for (size_t i = 0; i < recvsizes.size(); ++i) { - NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], - recvsizes[i], ncclUint8, sources[i], nccl_comm_, + NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], + recvsizes[i], + ncclUint8, + sources[i], + nccl_comm_, stream)); } NCCL_TRY(ncclGroupEnd()); @@ -473,10 +540,9 @@ class std_comms : public comms_iface { comms_ucp_handler ucp_handler_; ucp_worker_h ucp_worker_; - std::shared_ptr ucp_eps_; + std::shared_ptr ucp_eps_; mutable request_t next_request_id_; - mutable std::unordered_map - requests_in_flight_; + mutable std::unordered_map requests_in_flight_; mutable std::unordered_set free_requests_; std::shared_ptr device_allocator_; diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp index 4e95c4eef0..86827a294e 100644 --- a/cpp/include/raft/comms/test.hpp +++ b/cpp/include/raft/comms/test.hpp @@ -37,8 +37,9 @@ namespace comms { * @param the raft handle to use. This is expected to already have an * initialized comms instance. */ -bool test_collective_allreduce(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_allreduce(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = 1; @@ -46,14 +47,12 @@ bool test_collective_allreduce(const handle_t &handle, int root) { raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); temp_d.resize(1, stream); - CUDA_CHECK( - cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); communicator.allreduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, stream); int temp_h = 0; - CUDA_CHECK( - cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -69,8 +68,9 @@ bool test_collective_allreduce(const handle_t &handle, int root) { * @param the raft handle to use. This is expected to already have an * initialized comms instance. */ -bool test_collective_broadcast(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_broadcast(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = root; @@ -80,14 +80,12 @@ bool test_collective_broadcast(const handle_t &handle, int root) { temp_d.resize(1, stream); if (communicator.get_rank() == root) - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.bcast(temp_d.data(), 1, root, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), - cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -97,8 +95,9 @@ bool test_collective_broadcast(const handle_t &handle, int root) { return temp_h == root; } -bool test_collective_reduce(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_reduce(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = root; @@ -107,14 +106,12 @@ bool test_collective_reduce(const handle_t &handle, int root) { raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); temp_d.resize(1, stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.reduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, root, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), - cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -127,8 +124,9 @@ bool test_collective_reduce(const handle_t &handle, int root) { return true; } -bool test_collective_allgather(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_allgather(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = communicator.get_rank(); @@ -137,19 +135,16 @@ bool test_collective_allgather(const handle_t &handle, int root) { raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); temp_d.resize(1, stream); - raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, - communicator.get_size()); + raft::mr::device::buffer recv_d( + handle.get_device_allocator(), stream, communicator.get_size()); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.allgather(temp_d.data(), recv_d.data(), 1, stream); communicator.sync_stream(stream); - int - temp_h[communicator.get_size()]; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), - sizeof(int) * communicator.get_size(), - cudaMemcpyDeviceToHost, stream)); + int temp_h[communicator.get_size()]; // Verify more than one byte is being sent + CUDA_CHECK(cudaMemcpyAsync( + &temp_h, recv_d.data(), sizeof(int) * communicator.get_size(), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -162,8 +157,9 @@ bool test_collective_allgather(const handle_t &handle, int root) { return true; } -bool test_collective_gather(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_gather(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = communicator.get_rank(); @@ -173,20 +169,19 @@ bool test_collective_gather(const handle_t &handle, int root) { temp_d.resize(1, stream); raft::mr::device::buffer recv_d( - handle.get_device_allocator(), stream, + handle.get_device_allocator(), + stream, communicator.get_rank() == root ? communicator.get_size() : 0); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.gather(temp_d.data(), recv_d.data(), 1, root, stream); communicator.sync_stream(stream); if (communicator.get_rank() == root) { std::vector temp_h(communicator.get_size(), 0); - CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(), - sizeof(int) * temp_h.size(), - cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync( + temp_h.data(), recv_d.data(), sizeof(int) * temp_h.size(), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); for (int i = 0; i < communicator.get_size(); i++) { @@ -196,46 +191,48 @@ bool test_collective_gather(const handle_t &handle, int root) { return true; } -bool test_collective_gatherv(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_gatherv(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); std::vector sendcounts(communicator.get_size()); std::iota(sendcounts.begin(), sendcounts.end(), size_t{1}); std::vector displacements(communicator.get_size() + 1, 0); - std::partial_sum(sendcounts.begin(), sendcounts.end(), - displacements.begin() + 1); + std::partial_sum(sendcounts.begin(), sendcounts.end(), displacements.begin() + 1); - std::vector sends(displacements[communicator.get_rank() + 1] - - displacements[communicator.get_rank()], - communicator.get_rank()); + std::vector sends( + displacements[communicator.get_rank() + 1] - displacements[communicator.get_rank()], + communicator.get_rank()); cudaStream_t stream = handle.get_stream(); raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); temp_d.resize(sends.size(), stream); - raft::mr::device::buffer recv_d( - handle.get_device_allocator(), stream, - communicator.get_rank() == root ? displacements.back() : 0); + raft::mr::device::buffer recv_d(handle.get_device_allocator(), + stream, + communicator.get_rank() == root ? displacements.back() : 0); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), - sends.size() * sizeof(int), cudaMemcpyHostToDevice, - stream)); + CUDA_CHECK(cudaMemcpyAsync( + temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.gatherv( - temp_d.data(), recv_d.data(), temp_d.size(), - communicator.get_rank() == root ? sendcounts.data() - : static_cast(nullptr), - communicator.get_rank() == root ? displacements.data() - : static_cast(nullptr), - root, stream); + temp_d.data(), + recv_d.data(), + temp_d.size(), + communicator.get_rank() == root ? sendcounts.data() : static_cast(nullptr), + communicator.get_rank() == root ? displacements.data() : static_cast(nullptr), + root, + stream); communicator.sync_stream(stream); if (communicator.get_rank() == root) { std::vector temp_h(displacements.back(), 0); - CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(), + CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), + recv_d.data(), sizeof(int) * displacements.back(), - cudaMemcpyDeviceToHost, stream)); + cudaMemcpyDeviceToHost, + stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); for (int i = 0; i < communicator.get_size(); i++) { @@ -249,28 +246,24 @@ bool test_collective_gatherv(const handle_t &handle, int root) { return true; } -bool test_collective_reducescatter(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_reducescatter(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); std::vector sends(communicator.get_size(), 1); cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream, - sends.size()); - raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, - 1); + raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream, sends.size()); + raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, 1); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), - sends.size() * sizeof(int), cudaMemcpyHostToDevice, - stream)); + CUDA_CHECK(cudaMemcpyAsync( + temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream)); - communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, - stream); + communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int), - cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -287,9 +280,10 @@ bool test_collective_reducescatter(const handle_t &handle, int root) { * initialized comms instance. * @param number of iterations of all-to-all messaging to perform */ -bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); +bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -298,11 +292,11 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { std::vector requests; requests.resize(2 * (communicator.get_size() - 1)); int request_idx = 0; - //post receives + // post receives for (int r = 0; r < communicator.get_size(); ++r) { if (r != rank) { - communicator.irecv(received_data.data() + request_idx, 1, r, 0, - requests.data() + request_idx); + communicator.irecv( + received_data.data() + request_idx, 1, r, 0, requests.data() + request_idx); ++request_idx; } } @@ -338,8 +332,7 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { communicator.barrier(); } - if (communicator.get_rank() == 0) - std::cout << "=========================" << std::endl; + if (communicator.get_rank() == 0) std::cout << "=========================" << std::endl; } return ret; @@ -352,10 +345,11 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { * initialized comms instance. * @param number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); - cudaStream_t stream = h.get_stream(); +bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); + cudaStream_t stream = h.get_stream(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -378,13 +372,9 @@ bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) { communicator.sync_stream(stream); - if (!sender && received_data.value(stream) != rank - 1) { - ret = false; - } + if (!sender && received_data.value(stream) != rank - 1) { ret = false; } - if (communicator.get_rank() == 0) { - std::cout << "=========================" << std::endl; - } + if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; } } return ret; @@ -397,10 +387,11 @@ bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) { * initialized comms instance. * @param number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); - cudaStream_t stream = h.get_stream(); +bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); + cudaStream_t stream = h.get_stream(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -414,12 +405,12 @@ bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) { if (rank % 2 == 0) { if (rank + 1 < communicator.get_size()) { - communicator.device_sendrecv(sent_data.data(), 1, rank + 1, - received_data.data(), 1, rank + 1, stream); + communicator.device_sendrecv( + sent_data.data(), 1, rank + 1, received_data.data(), 1, rank + 1, stream); } } else { - communicator.device_sendrecv(sent_data.data(), 1, rank - 1, - received_data.data(), 1, rank - 1, stream); + communicator.device_sendrecv( + sent_data.data(), 1, rank - 1, received_data.data(), 1, rank - 1, stream); } communicator.sync_stream(stream); @@ -429,9 +420,7 @@ bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) { ret = false; } - if (communicator.get_rank() == 0) { - std::cout << "=========================" << std::endl; - } + if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; } } return ret; @@ -444,11 +433,11 @@ bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) { * initialized comms instance. * @param number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h, - int numTrials) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); - cudaStream_t stream = h.get_stream(); +bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrials) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); + cudaStream_t stream = h.get_stream(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -471,25 +460,26 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h, std::vector srcs(communicator.get_size()); std::iota(srcs.begin(), srcs.end(), int{0}); - communicator.device_multicast_sendrecv( - sent_data.data(), sendsizes, sendoffsets, dests, received_data.data(), - recvsizes, recvoffsets, srcs, stream); + communicator.device_multicast_sendrecv(sent_data.data(), + sendsizes, + sendoffsets, + dests, + received_data.data(), + recvsizes, + recvoffsets, + srcs, + stream); communicator.sync_stream(stream); std::vector h_received_data(communicator.get_size()); - raft::update_host(h_received_data.data(), received_data.data(), - received_data.size(), stream); + raft::update_host(h_received_data.data(), received_data.data(), received_data.size(), stream); CUDA_TRY(cudaStreamSynchronize(stream)); for (int i = 0; i < communicator.get_size(); ++i) { - if (h_received_data[i] != i) { - ret = false; - } + if (h_received_data[i] != i) { ret = false; } } - if (communicator.get_rank() == 0) { - std::cout << "=========================" << std::endl; - } + if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; } } return ret; @@ -502,20 +492,20 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h, * initialized comms instance. * @param n_colors number of different colors to test */ -bool test_commsplit(const handle_t &h, int n_colors) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); - int const size = communicator.get_size(); +bool test_commsplit(const handle_t& h, int n_colors) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); + int const size = communicator.get_size(); if (n_colors > size) n_colors = size; // first we need to assign to a color, then assign the rank within the color int color = rank % n_colors; - int key = rank / n_colors; + int key = rank / n_colors; handle_t new_handle(1); - auto shared_comm = - std::make_shared(communicator.comm_split(color, key)); + auto shared_comm = std::make_shared(communicator.comm_split(color, key)); new_handle.set_comms(shared_comm); return test_collective_allreduce(new_handle, 0); diff --git a/cpp/include/raft/comms/ucp_helper.hpp b/cpp/include/raft/comms/ucp_helper.hpp index 226b6f0527..89c7b25630 100644 --- a/cpp/include/raft/comms/ucp_helper.hpp +++ b/cpp/include/raft/comms/ucp_helper.hpp @@ -25,16 +25,19 @@ namespace raft { namespace comms { -typedef void (*dlsym_print_info)(ucp_ep_h, FILE *); -typedef void (*dlsym_rec_free)(void *); +typedef void (*dlsym_print_info)(ucp_ep_h, FILE*); +typedef void (*dlsym_rec_free)(void*); typedef int (*dlsym_worker_progress)(ucp_worker_h); -typedef ucs_status_ptr_t (*dlsym_send)(ucp_ep_h, const void *, size_t, - ucp_datatype_t, ucp_tag_t, - ucp_send_callback_t); -typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h, void *, size_t count, - ucp_datatype_t datatype, ucp_tag_t, - ucp_tag_t, ucp_tag_recv_callback_t); +typedef ucs_status_ptr_t (*dlsym_send)( + ucp_ep_h, const void*, size_t, ucp_datatype_t, ucp_tag_t, ucp_send_callback_t); +typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h, + void*, + size_t count, + ucp_datatype_t datatype, + ucp_tag_t, + ucp_tag_t, + ucp_tag_recv_callback_t); /** * Standard UCX request object that will be passed @@ -55,9 +58,9 @@ struct ucx_context { */ class ucp_request { public: - struct ucx_context *req; - bool needs_release = true; - int other_rank = -1; + struct ucx_context* req; + bool needs_release = true; + int other_rank = -1; bool is_send_request = false; }; @@ -67,18 +70,19 @@ static const ucp_tag_t default_tag_mask = -1; /** * @brief Asynchronous send callback sets request to completed */ -static void send_callback(void *request, ucs_status_t status) { - struct ucx_context *context = (struct ucx_context *)request; - context->completed = 1; +static void send_callback(void* request, ucs_status_t status) +{ + struct ucx_context* context = (struct ucx_context*)request; + context->completed = 1; } /** * @brief Asynchronous recv callback sets request to completed */ -static void recv_callback(void *request, ucs_status_t status, - ucp_tag_recv_info_t *info) { - struct ucx_context *context = (struct ucx_context *)request; - context->completed = 1; +static void recv_callback(void* request, ucs_status_t status, ucp_tag_recv_info_t* info) +{ + struct ucx_context* context = (struct ucx_context*)request; + context->completed = 1; } /** @@ -87,7 +91,8 @@ static void recv_callback(void *request, ucs_status_t status, */ class comms_ucp_handler { public: - comms_ucp_handler() { + comms_ucp_handler() + { load_ucp_handle(); load_send_func(); load_recv_func(); @@ -99,7 +104,7 @@ class comms_ucp_handler { ~comms_ucp_handler() { dlclose(ucp_handle); } private: - void *ucp_handle; + void* ucp_handle; dlsym_print_info print_info_func; dlsym_rec_free req_free_func; @@ -107,7 +112,8 @@ class comms_ucp_handler { dlsym_send send_func; dlsym_recv recv_func; - void load_ucp_handle() { + void load_ucp_handle() + { ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NOLOAD | RTLD_NODELETE); if (!ucp_handle) { ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NODELETE); @@ -117,51 +123,56 @@ class comms_ucp_handler { dlerror(); } - void assert_dlerror() { - char *error = dlerror(); + void assert_dlerror() + { + char* error = dlerror(); ASSERT(error == NULL, "Error loading function symbol: %s\n", error); } - void load_send_func() { + void load_send_func() + { send_func = (dlsym_send)dlsym(ucp_handle, "ucp_tag_send_nb"); assert_dlerror(); } - void load_free_req_func() { + void load_free_req_func() + { req_free_func = (dlsym_rec_free)dlsym(ucp_handle, "ucp_request_free"); assert_dlerror(); } - void load_print_info_func() { + void load_print_info_func() + { print_info_func = (dlsym_print_info)dlsym(ucp_handle, "ucp_ep_print_info"); assert_dlerror(); } - void load_worker_progress_func() { - worker_progress_func = - (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress"); + void load_worker_progress_func() + { + worker_progress_func = (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress"); assert_dlerror(); } - void load_recv_func() { + void load_recv_func() + { recv_func = (dlsym_recv)dlsym(ucp_handle, "ucp_tag_recv_nb"); assert_dlerror(); } - ucp_tag_t build_message_tag(int rank, int tag) const { + ucp_tag_t build_message_tag(int rank, int tag) const + { // keeping the rank in the lower bits enables debugging. return ((uint32_t)tag << 31) | (uint32_t)rank; } public: - int ucp_progress(ucp_worker_h worker) const { - return (*(worker_progress_func))(worker); - } + int ucp_progress(ucp_worker_h worker) const { return (*(worker_progress_func))(worker); } /** * @brief Frees any memory underlying the given ucp request object */ - void free_ucp_request(ucp_request *request) const { + void free_ucp_request(ucp_request* request) const + { if (request->needs_release) { request->req->completed = 0; (*(req_free_func))(request->req); @@ -172,56 +183,67 @@ class comms_ucp_handler { /** * @brief Asynchronously send data to the given endpoint using the given tag */ - void ucp_isend(ucp_request *req, ucp_ep_h ep_ptr, const void *buf, - size_t size, int tag, ucp_tag_t tag_mask, int rank) const { + void ucp_isend(ucp_request* req, + ucp_ep_h ep_ptr, + const void* buf, + size_t size, + int tag, + ucp_tag_t tag_mask, + int rank) const + { ucp_tag_t ucp_tag = build_message_tag(rank, tag); - ucs_status_ptr_t send_result = (*(send_func))( - ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback); - struct ucx_context *ucp_req = (struct ucx_context *)send_result; + ucs_status_ptr_t send_result = + (*(send_func))(ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback); + struct ucx_context* ucp_req = (struct ucx_context*)send_result; if (UCS_PTR_IS_ERR(send_result)) { ASSERT(!UCS_PTR_IS_ERR(send_result), "unable to send UCX data message (%d)\n", UCS_PTR_STATUS(send_result)); /** - * If the request didn't fail, but it's not OK, it is in flight. - * Expect the handler to be invoked - */ + * If the request didn't fail, but it's not OK, it is in flight. + * Expect the handler to be invoked + */ } else if (UCS_PTR_STATUS(send_result) != UCS_OK) { /** - * If the request is OK, it's already been completed and we don't need to wait on it. - * The request will be a nullptr, however, so we need to create a new request - * and set it to completed to make the "waitall()" function work properly. - */ + * If the request is OK, it's already been completed and we don't need to wait on it. + * The request will be a nullptr, however, so we need to create a new request + * and set it to completed to make the "waitall()" function work properly. + */ req->needs_release = true; } else { req->needs_release = false; } - req->other_rank = rank; + req->other_rank = rank; req->is_send_request = true; - req->req = ucp_req; + req->req = ucp_req; } /** * @brief Asynchronously receive data from given endpoint with the given tag. */ - void ucp_irecv(ucp_request *req, ucp_worker_h worker, ucp_ep_h ep_ptr, - void *buf, size_t size, int tag, ucp_tag_t tag_mask, - int sender_rank) const { + void ucp_irecv(ucp_request* req, + ucp_worker_h worker, + ucp_ep_h ep_ptr, + void* buf, + size_t size, + int tag, + ucp_tag_t tag_mask, + int sender_rank) const + { ucp_tag_t ucp_tag = build_message_tag(sender_rank, tag); ucs_status_ptr_t recv_result = - (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, - tag_mask, recv_callback); + (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, tag_mask, recv_callback); - struct ucx_context *ucp_req = (struct ucx_context *)recv_result; + struct ucx_context* ucp_req = (struct ucx_context*)recv_result; - req->req = ucp_req; - req->needs_release = true; + req->req = ucp_req; + req->needs_release = true; req->is_send_request = false; - req->other_rank = sender_rank; + req->other_rank = sender_rank; ASSERT(!UCS_PTR_IS_ERR(recv_result), "unable to receive UCX data message (%d)\n", diff --git a/cpp/include/raft/comms/util.hpp b/cpp/include/raft/comms/util.hpp index f3216abc37..1b0548fc00 100644 --- a/cpp/include/raft/comms/util.hpp +++ b/cpp/include/raft/comms/util.hpp @@ -26,88 +26,70 @@ * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an * exception detailing the NCCL error that occurred */ -#define NCCL_TRY(call) \ - do { \ - ncclResult_t const status = (call); \ - if (ncclSuccess != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "NCCL error encountered at: ", "call='%s', Reason=%d:%s", \ - #call, status, ncclGetErrorString(status)); \ - throw raft::logic_error(msg); \ - } \ +#define NCCL_TRY(call) \ + do { \ + ncclResult_t const status = (call); \ + if (ncclSuccess != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "NCCL error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + ncclGetErrorString(status)); \ + throw raft::logic_error(msg); \ + } \ } while (0); -#define NCCL_TRY_NO_THROW(call) \ - do { \ - ncclResult_t status = call; \ - if (ncclSuccess != status) { \ - printf("NCCL call='%s' failed. Reason:%s\n", #call, \ - ncclGetErrorString(status)); \ - } \ +#define NCCL_TRY_NO_THROW(call) \ + do { \ + ncclResult_t status = call; \ + if (ncclSuccess != status) { \ + printf("NCCL call='%s' failed. Reason:%s\n", #call, ncclGetErrorString(status)); \ + } \ } while (0) namespace raft { namespace comms { -constexpr size_t get_datatype_size(const datatype_t datatype) { +constexpr size_t get_datatype_size(const datatype_t datatype) +{ switch (datatype) { - case datatype_t::CHAR: - return sizeof(char); - case datatype_t::UINT8: - return sizeof(uint8_t); - case datatype_t::INT32: - return sizeof(int); - case datatype_t::UINT32: - return sizeof(unsigned int); - case datatype_t::INT64: - return sizeof(int64_t); - case datatype_t::UINT64: - return sizeof(uint64_t); - case datatype_t::FLOAT32: - return sizeof(float); - case datatype_t::FLOAT64: - return sizeof(double); - default: - throw "Unsupported datatype"; + case datatype_t::CHAR: return sizeof(char); + case datatype_t::UINT8: return sizeof(uint8_t); + case datatype_t::INT32: return sizeof(int); + case datatype_t::UINT32: return sizeof(unsigned int); + case datatype_t::INT64: return sizeof(int64_t); + case datatype_t::UINT64: return sizeof(uint64_t); + case datatype_t::FLOAT32: return sizeof(float); + case datatype_t::FLOAT64: return sizeof(double); + default: throw "Unsupported datatype"; } } -constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype) { +constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype) +{ switch (datatype) { - case datatype_t::CHAR: - return ncclChar; - case datatype_t::UINT8: - return ncclUint8; - case datatype_t::INT32: - return ncclInt; - case datatype_t::UINT32: - return ncclUint32; - case datatype_t::INT64: - return ncclInt64; - case datatype_t::UINT64: - return ncclUint64; - case datatype_t::FLOAT32: - return ncclFloat; - case datatype_t::FLOAT64: - return ncclDouble; - default: - throw "Unsupported datatype"; + case datatype_t::CHAR: return ncclChar; + case datatype_t::UINT8: return ncclUint8; + case datatype_t::INT32: return ncclInt; + case datatype_t::UINT32: return ncclUint32; + case datatype_t::INT64: return ncclInt64; + case datatype_t::UINT64: return ncclUint64; + case datatype_t::FLOAT32: return ncclFloat; + case datatype_t::FLOAT64: return ncclDouble; + default: throw "Unsupported datatype"; } } -constexpr ncclRedOp_t get_nccl_op(const op_t op) { +constexpr ncclRedOp_t get_nccl_op(const op_t op) +{ switch (op) { - case op_t::SUM: - return ncclSum; - case op_t::PROD: - return ncclProd; - case op_t::MIN: - return ncclMin; - case op_t::MAX: - return ncclMax; - default: - throw "Unsupported datatype"; + case op_t::SUM: return ncclSum; + case op_t::PROD: return ncclProd; + case op_t::MIN: return ncclMin; + case op_t::MAX: return ncclMax; + default: throw "Unsupported datatype"; } } }; // namespace comms diff --git a/cpp/include/raft/cuda_utils.cuh b/cpp/include/raft/cuda_utils.cuh index 14274043f5..8a66eff242 100644 --- a/cpp/include/raft/cuda_utils.cuh +++ b/cpp/include/raft/cuda_utils.cuh @@ -36,16 +36,17 @@ namespace raft { /** helper macro for device inlined functions */ -#define DI inline __device__ +#define DI inline __device__ #define HDI inline __host__ __device__ -#define HD __host__ __device__ +#define HD __host__ __device__ /** * @brief Provide a ceiling division operation ie. ceil(a / b) * @tparam IntType supposed to be only integers for now! */ template -constexpr HDI IntType ceildiv(IntType a, IntType b) { +constexpr HDI IntType ceildiv(IntType a, IntType b) +{ return (a + b - 1) / b; } @@ -54,7 +55,8 @@ constexpr HDI IntType ceildiv(IntType a, IntType b) { * @tparam IntType supposed to be only integers for now! */ template -constexpr HDI IntType alignTo(IntType a, IntType b) { +constexpr HDI IntType alignTo(IntType a, IntType b) +{ return ceildiv(a, b) * b; } @@ -63,7 +65,8 @@ constexpr HDI IntType alignTo(IntType a, IntType b) { * @tparam IntType supposed to be only integers for now! */ template -constexpr HDI IntType alignDown(IntType a, IntType b) { +constexpr HDI IntType alignDown(IntType a, IntType b) +{ return (a / b) * b; } @@ -72,7 +75,8 @@ constexpr HDI IntType alignDown(IntType a, IntType b) { * @tparam IntType data type (checked only for integers) */ template -constexpr HDI bool isPo2(IntType num) { +constexpr HDI bool isPo2(IntType num) +{ return (num && !(num & (num - 1))); } @@ -81,14 +85,16 @@ constexpr HDI bool isPo2(IntType num) { * @tparam IntType data type (checked only for integers) */ template -constexpr HDI IntType log2(IntType num, IntType ret = IntType(0)) { +constexpr HDI IntType log2(IntType num, IntType ret = IntType(0)) +{ return num <= IntType(1) ? ret : log2(num >> IntType(1), ++ret); } /** Device function to apply the input lambda across threads in the grid */ template -DI void forEach(int num, L lambda) { - int idx = (blockDim.x * blockIdx.x) + threadIdx.x; +DI void forEach(int num, L lambda) +{ + int idx = (blockDim.x * blockIdx.x) + threadIdx.x; const int numThreads = blockDim.x * gridDim.x; #pragma unroll for (int itr = 0; itr < ItemsPerThread; ++itr, idx += numThreads) { @@ -100,7 +106,8 @@ DI void forEach(int num, L lambda) { static const int WarpSize = 32; /** get the laneId of the current thread */ -DI int laneId() { +DI int laneId() +{ int id; asm("mov.s32 %0, %laneid;" : "=r"(id)); return id; @@ -113,15 +120,17 @@ DI int laneId() { * @param b second input */ template -HDI void swapVals(T &a, T &b) { +HDI void swapVals(T& a, T& b) +{ T tmp = a; - a = b; - b = tmp; + a = b; + b = tmp; } /** Device function to have atomic add support for older archs */ template -DI void myAtomicAdd(Type *address, Type val) { +DI void myAtomicAdd(Type* address, Type val) +{ atomicAdd(address, val); } @@ -129,105 +138,114 @@ DI void myAtomicAdd(Type *address, Type val) { // Ref: // http://on-demand.gputechconf.com/gtc/2013/presentations/S3101-Atomic-Memory-Operations.pdf template <> -DI void myAtomicAdd(double *address, double val) { - unsigned long long int *address_as_ull = (unsigned long long int *)address; - unsigned long long int old = *address_as_ull, assumed; +DI void myAtomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; do { assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); + old = + atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); } while (assumed != old); } #endif template -DI void myAtomicReduce(T *address, T val, ReduceLambda op); +DI void myAtomicReduce(T* address, T val, ReduceLambda op); template -DI void myAtomicReduce(double *address, double val, ReduceLambda op) { - unsigned long long int *address_as_ull = (unsigned long long int *)address; - unsigned long long int old = *address_as_ull, assumed; +DI void myAtomicReduce(double* address, double val, ReduceLambda op) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; do { assumed = old; - old = - atomicCAS(address_as_ull, assumed, - __double_as_longlong(op(val, __longlong_as_double(assumed)))); + old = atomicCAS( + address_as_ull, assumed, __double_as_longlong(op(val, __longlong_as_double(assumed)))); } while (assumed != old); } template -DI void myAtomicReduce(float *address, float val, ReduceLambda op) { - unsigned int *address_as_uint = (unsigned int *)address; - unsigned int old = *address_as_uint, assumed; +DI void myAtomicReduce(float* address, float val, ReduceLambda op) +{ + unsigned int* address_as_uint = (unsigned int*)address; + unsigned int old = *address_as_uint, assumed; do { assumed = old; - old = atomicCAS(address_as_uint, assumed, - __float_as_uint(op(val, __uint_as_float(assumed)))); + old = atomicCAS(address_as_uint, assumed, __float_as_uint(op(val, __uint_as_float(assumed)))); } while (assumed != old); } template -DI void myAtomicReduce(int *address, int val, ReduceLambda op) { +DI void myAtomicReduce(int* address, int val, ReduceLambda op) +{ int old = *address, assumed; do { assumed = old; - old = atomicCAS(address, assumed, op(val, assumed)); + old = atomicCAS(address, assumed, op(val, assumed)); } while (assumed != old); } template -DI void myAtomicReduce(long long *address, long long val, ReduceLambda op) { +DI void myAtomicReduce(long long* address, long long val, ReduceLambda op) +{ long long old = *address, assumed; do { assumed = old; - old = atomicCAS(address, assumed, op(val, assumed)); + old = atomicCAS(address, assumed, op(val, assumed)); } while (assumed != old); } template -DI void myAtomicReduce(unsigned long long *address, unsigned long long val, - ReduceLambda op) { +DI void myAtomicReduce(unsigned long long* address, unsigned long long val, ReduceLambda op) +{ unsigned long long old = *address, assumed; do { assumed = old; - old = atomicCAS(address, assumed, op(val, assumed)); + old = atomicCAS(address, assumed, op(val, assumed)); } while (assumed != old); } /** * @brief Provide atomic min operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val) + * @param[in] address: address to read old value from, and to atomically update w/ min(old value, + * val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMin(T *address, T val); +DI T myAtomicMin(T* address, T val); /** * @brief Provide atomic max operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val) + * @param[in] address: address to read old value from, and to atomically update w/ max(old value, + * val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMax(T *address, T val); +DI T myAtomicMax(T* address, T val); -DI float myAtomicMin(float *address, float val) { +DI float myAtomicMin(float* address, float val) +{ myAtomicReduce(address, val, fminf); return *address; } -DI float myAtomicMax(float *address, float val) { +DI float myAtomicMax(float* address, float val) +{ myAtomicReduce(address, val, fmaxf); return *address; } -DI double myAtomicMin(double *address, double val) { +DI double myAtomicMin(double* address, double val) +{ myAtomicReduce(address, val, fmin); return *address; } -DI double myAtomicMax(double *address, double val) { +DI double myAtomicMax(double* address, double val) +{ myAtomicReduce(address, val, fmax); return *address; } @@ -239,11 +257,13 @@ DI double myAtomicMax(double *address, double val) { template HDI T myMax(T x, T y); template <> -HDI float myMax(float x, float y) { +HDI float myMax(float x, float y) +{ return fmaxf(x, y); } template <> -HDI double myMax(double x, double y) { +HDI double myMax(double x, double y) +{ return fmax(x, y); } /** @} */ @@ -255,11 +275,13 @@ HDI double myMax(double x, double y) { template HDI T myMin(T x, T y); template <> -HDI float myMin(float x, float y) { +HDI float myMin(float x, float y) +{ return fminf(x, y); } template <> -HDI double myMin(double x, double y) { +HDI double myMin(double x, double y) +{ return fmin(x, y); } /** @} */ @@ -267,11 +289,13 @@ HDI double myMin(double x, double y) { /** * @brief Provide atomic min operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val) + * @param[in] address: address to read old value from, and to atomically update w/ min(old value, + * val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMin(T *address, T val) { +DI T myAtomicMin(T* address, T val) +{ myAtomicReduce(address, val, myMin); return *address; } @@ -279,11 +303,13 @@ DI T myAtomicMin(T *address, T val) { /** * @brief Provide atomic max operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val) + * @param[in] address: address to read old value from, and to atomically update w/ max(old value, + * val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMax(T *address, T val) { +DI T myAtomicMax(T* address, T val) +{ myAtomicReduce(address, val, myMax); return *address; } @@ -292,7 +318,8 @@ DI T myAtomicMax(T *address, T val) { * Sign function */ template -HDI int sgn(const T val) { +HDI int sgn(const T val) +{ return (T(0) < val) - (val < T(0)); } @@ -303,11 +330,13 @@ HDI int sgn(const T val) { template HDI T myExp(T x); template <> -HDI float myExp(float x) { +HDI float myExp(float x) +{ return expf(x); } template <> -HDI double myExp(double x) { +HDI double myExp(double x) +{ return exp(x); } /** @} */ @@ -319,11 +348,13 @@ HDI double myExp(double x) { template inline __device__ T myInf(); template <> -inline __device__ float myInf() { +inline __device__ float myInf() +{ return CUDART_INF_F; } template <> -inline __device__ double myInf() { +inline __device__ double myInf() +{ return CUDART_INF; } /** @} */ @@ -335,11 +366,13 @@ inline __device__ double myInf() { template HDI T myLog(T x); template <> -HDI float myLog(float x) { +HDI float myLog(float x) +{ return logf(x); } template <> -HDI double myLog(double x) { +HDI double myLog(double x) +{ return log(x); } /** @} */ @@ -351,11 +384,13 @@ HDI double myLog(double x) { template HDI T mySqrt(T x); template <> -HDI float mySqrt(float x) { +HDI float mySqrt(float x) +{ return sqrtf(x); } template <> -HDI double mySqrt(double x) { +HDI double mySqrt(double x) +{ return sqrt(x); } /** @} */ @@ -365,13 +400,15 @@ HDI double mySqrt(double x) { * @{ */ template -DI void mySinCos(T x, T &s, T &c); +DI void mySinCos(T x, T& s, T& c); template <> -DI void mySinCos(float x, float &s, float &c) { +DI void mySinCos(float x, float& s, float& c) +{ sincosf(x, &s, &c); } template <> -DI void mySinCos(double x, double &s, double &c) { +DI void mySinCos(double x, double& s, double& c) +{ sincos(x, &s, &c); } /** @} */ @@ -383,11 +420,13 @@ DI void mySinCos(double x, double &s, double &c) { template DI T mySin(T x); template <> -DI float mySin(float x) { +DI float mySin(float x) +{ return sinf(x); } template <> -DI double mySin(double x) { +DI double mySin(double x) +{ return sin(x); } /** @} */ @@ -397,15 +436,18 @@ DI double mySin(double x) { * @{ */ template -DI T myAbs(T x) { +DI T myAbs(T x) +{ return x < 0 ? -x : x; } template <> -DI float myAbs(float x) { +DI float myAbs(float x) +{ return fabsf(x); } template <> -DI double myAbs(double x) { +DI double myAbs(double x) +{ return fabs(x); } /** @} */ @@ -417,11 +459,13 @@ DI double myAbs(double x) { template HDI T myPow(T x, T power); template <> -HDI float myPow(float x, float power) { +HDI float myPow(float x, float power) +{ return powf(x, power); } template <> -HDI double myPow(double x, double power) { +HDI double myPow(double x, double power) +{ return pow(x, power); } /** @} */ @@ -433,11 +477,13 @@ HDI double myPow(double x, double power) { template HDI T myTanh(T x); template <> -HDI float myTanh(float x) { +HDI float myTanh(float x) +{ return tanhf(x); } template <> -HDI double myTanh(double x) { +HDI double myTanh(double x) +{ return tanh(x); } /** @} */ @@ -449,11 +495,13 @@ HDI double myTanh(double x) { template HDI T myATanh(T x); template <> -HDI float myATanh(float x) { +HDI float myATanh(float x) +{ return atanhf(x); } template <> -HDI double myATanh(double x) { +HDI double myATanh(double x) +{ return atanh(x); } /** @} */ @@ -492,15 +540,18 @@ struct Sum { * @{ */ template -DI T signPrim(T x) { +DI T signPrim(T x) +{ return x < 0 ? -1 : +1; } template <> -DI float signPrim(float x) { +DI float signPrim(float x) +{ return signbit(x) == true ? -1.0f : +1.0f; } template <> -DI double signPrim(double x) { +DI double signPrim(double x) +{ return signbit(x) == true ? -1.0 : +1.0; } /** @} */ @@ -514,28 +565,33 @@ DI double signPrim(double x) { * @{ */ template -DI T maxPrim(T x, T y) { +DI T maxPrim(T x, T y) +{ return x > y ? x : y; } template <> -DI float maxPrim(float x, float y) { +DI float maxPrim(float x, float y) +{ return fmaxf(x, y); } template <> -DI double maxPrim(double x, double y) { +DI double maxPrim(double x, double y) +{ return fmax(x, y); } /** @} */ /** apply a warp-wide fence (useful from Volta+ archs) */ -DI void warpFence() { +DI void warpFence() +{ #if __CUDA_ARCH__ >= 700 __syncwarp(); #endif } /** warp-wide any boolean aggregator */ -DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) { +DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) +{ #if CUDART_VERSION >= 9000 inFlag = __any_sync(mask, inFlag); #else @@ -545,7 +601,8 @@ DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) { } /** warp-wide all boolean aggregator */ -DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) { +DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) +{ #if CUDART_VERSION >= 9000 inFlag = __all_sync(mask, inFlag); #else @@ -564,8 +621,8 @@ DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) { * @return the shuffled data */ template -DI T shfl(T val, int srcLane, int width = WarpSize, - uint32_t mask = 0xffffffffu) { +DI T shfl(T val, int srcLane, int width = WarpSize, uint32_t mask = 0xffffffffu) +{ #if CUDART_VERSION >= 9000 return __shfl_sync(mask, val, srcLane, width); #else @@ -583,8 +640,8 @@ DI T shfl(T val, int srcLane, int width = WarpSize, * @return the shuffled data */ template -DI T shfl_xor(T val, int laneMask, int width = WarpSize, - uint32_t mask = 0xffffffffu) { +DI T shfl_xor(T val, int laneMask, int width = WarpSize, uint32_t mask = 0xffffffffu) +{ #if CUDART_VERSION >= 9000 return __shfl_xor_sync(mask, val, laneMask, width); #else @@ -602,7 +659,8 @@ DI T shfl_xor(T val, int laneMask, int width = WarpSize, * @todo Expand this to support arbitrary reduction ops */ template -DI T warpReduce(T val) { +DI T warpReduce(T val) +{ #pragma unroll for (int i = WarpSize / 2; i > 0; i >>= 1) { T tmp = shfl(val, laneId() + i); @@ -623,12 +681,13 @@ DI T warpReduce(T val) { * @todo Expand this to support arbitrary reduction ops */ template -DI T blockReduce(T val, char *smem) { - auto *sTemp = reinterpret_cast(smem); - int nWarps = (blockDim.x + WarpSize - 1) / WarpSize; - int lid = laneId(); - int wid = threadIdx.x / WarpSize; - val = warpReduce(val); +DI T blockReduce(T val, char* smem) +{ + auto* sTemp = reinterpret_cast(smem); + int nWarps = (blockDim.x + WarpSize - 1) / WarpSize; + int lid = laneId(); + int wid = threadIdx.x / WarpSize; + val = warpReduce(val); if (lid == 0) sTemp[wid] = val; __syncthreads(); val = lid < nWarps ? sTemp[lid] : T(0); @@ -644,8 +703,10 @@ DI T blockReduce(T val, char *smem) { * @param idx the index for which to query the stream */ inline cudaStream_t select_stream(cudaStream_t user_stream, - cudaStream_t *int_streams, int n_int_streams, - int idx) { + cudaStream_t* int_streams, + int n_int_streams, + int idx) +{ return n_int_streams > 0 ? int_streams[idx % n_int_streams] : user_stream; } diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 86c60addf2..872dab7d82 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -49,17 +49,20 @@ struct cuda_error : public raft::exception { * exception detailing the CUDA error that occurred * */ -#define CUDA_TRY(call) \ - do { \ - cudaError_t const status = call; \ - if (status != cudaSuccess) { \ - cudaGetLastError(); \ - std::string msg{}; \ - SET_ERROR_MSG( \ - msg, "CUDA error encountered at: ", "call='%s', Reason=%s:%s", #call, \ - cudaGetErrorName(status), cudaGetErrorString(status)); \ - throw raft::cuda_error(msg); \ - } \ +#define CUDA_TRY(call) \ + do { \ + cudaError_t const status = call; \ + if (status != cudaSuccess) { \ + cudaGetLastError(); \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "CUDA error encountered at: ", \ + "call='%s', Reason=%s:%s", \ + #call, \ + cudaGetErrorName(status), \ + cudaGetErrorString(status)); \ + throw raft::cuda_error(msg); \ + } \ } while (0) /** @@ -89,13 +92,16 @@ struct cuda_error : public raft::exception { // * @brief check for cuda runtime API errors but log error instead of raising // * exception. // */ -#define CUDA_CHECK_NO_THROW(call) \ - do { \ - cudaError_t const status = call; \ - if (cudaSuccess != status) { \ - printf("CUDA call='%s' at file=%s line=%d failed with %s\n", #call, \ - __FILE__, __LINE__, cudaGetErrorString(status)); \ - } \ +#define CUDA_CHECK_NO_THROW(call) \ + do { \ + cudaError_t const status = call; \ + if (cudaSuccess != status) { \ + printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \ + #call, \ + __FILE__, \ + __LINE__, \ + cudaGetErrorString(status)); \ + } \ } while (0) namespace raft { @@ -103,9 +109,7 @@ namespace raft { /** Helper method to get to know warp size in device code */ __host__ __device__ constexpr inline int warp_size() { return 32; } -__host__ __device__ constexpr inline unsigned int warp_full_mask() { - return 0xffffffff; -} +__host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } /** * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping @@ -124,13 +128,16 @@ class grid_1d_thread_t { * @param elements_per_thread Typically, a single kernel thread processes more than a single * element; this affects the number of threads the grid must contain */ - grid_1d_thread_t(size_t overall_num_elements, size_t num_threads_per_block, - size_t max_num_blocks_1d, size_t elements_per_thread = 1) + grid_1d_thread_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d, + size_t elements_per_thread = 1) : block_size(num_threads_per_block), - num_blocks(std::min((overall_num_elements + - (elements_per_thread * num_threads_per_block) - 1) / - (elements_per_thread * num_threads_per_block), - max_num_blocks_1d)) { + num_blocks( + std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / + (elements_per_thread * num_threads_per_block), + max_num_blocks_1d)) + { RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, "num_threads_per_block / warp_size() must be > 0"); @@ -153,13 +160,14 @@ class grid_1d_warp_t { * specific features (amount of shared memory necessary, SM functional units use pattern etc.); * this can't be determined generically/automatically (as opposed to the number of blocks) */ - grid_1d_warp_t(size_t overall_num_elements, size_t num_threads_per_block, + grid_1d_warp_t(size_t overall_num_elements, + size_t num_threads_per_block, size_t max_num_blocks_1d) : block_size(num_threads_per_block), - num_blocks(std::min( - (overall_num_elements + (num_threads_per_block / warp_size()) - 1) / - (num_threads_per_block / warp_size()), - max_num_blocks_1d)) { + num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / + (num_threads_per_block / warp_size()), + max_num_blocks_1d)) + { RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, "num_threads_per_block / warp_size() must be > 0"); @@ -181,10 +189,12 @@ class grid_1d_block_t { * specific features (amount of shared memory necessary, SM functional units use pattern etc.); * this can't be determined generically/automatically (as opposed to the number of blocks) */ - grid_1d_block_t(size_t overall_num_elements, size_t num_threads_per_block, + grid_1d_block_t(size_t overall_num_elements, + size_t num_threads_per_block, size_t max_num_blocks_1d) : block_size(num_threads_per_block), - num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) { + num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) + { RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, "num_threads_per_block / warp_size() must be > 0"); @@ -200,9 +210,9 @@ class grid_1d_block_t { * @param stream cuda stream */ template -void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) { - CUDA_CHECK( - cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); +void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) +{ + CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); } /** @@ -213,23 +223,22 @@ void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) { */ /** performs a host to device copy */ template -void update_device(Type* d_ptr, const Type* h_ptr, size_t len, - cudaStream_t stream) { +void update_device(Type* d_ptr, const Type* h_ptr, size_t len, cudaStream_t stream) +{ copy(d_ptr, h_ptr, len, stream); } /** performs a device to host copy */ template -void update_host(Type* h_ptr, const Type* d_ptr, size_t len, - cudaStream_t stream) { +void update_host(Type* h_ptr, const Type* d_ptr, size_t len, cudaStream_t stream) +{ copy(h_ptr, d_ptr, len, stream); } template -void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, - cudaStream_t stream) { - CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), - cudaMemcpyDeviceToDevice, stream)); +void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, cudaStream_t stream) +{ + CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); } /** @} */ @@ -238,8 +247,11 @@ void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, * @{ */ template -void print_host_vector(const char* variable_name, const T* host_mem, - size_t componentsCount, OutStream& out) { +void print_host_vector(const char* variable_name, + const T* host_mem, + size_t componentsCount, + OutStream& out) +{ out << variable_name << "=["; for (size_t i = 0; i < componentsCount; ++i) { if (i != 0) out << ","; @@ -249,11 +261,13 @@ void print_host_vector(const char* variable_name, const T* host_mem, } template -void print_device_vector(const char* variable_name, const T* devMem, - size_t componentsCount, OutStream& out) { +void print_device_vector(const char* variable_name, + const T* devMem, + size_t componentsCount, + OutStream& out) +{ T* host_mem = new T[componentsCount]; - CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), - cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); print_host_vector(variable_name, host_mem, componentsCount, out); delete[] host_mem; } @@ -261,35 +275,36 @@ void print_device_vector(const char* variable_name, const T* devMem, /** cuda malloc */ template -void allocate(Type*& ptr, size_t len, bool setZero = false) { +void allocate(Type*& ptr, size_t len, bool setZero = false) +{ CUDA_CHECK(cudaMalloc((void**)&ptr, sizeof(Type) * len)); if (setZero) CUDA_CHECK(cudaMemset(ptr, 0, sizeof(Type) * len)); } /** helper method to get max usable shared mem per block parameter */ -inline int getSharedMemPerBlock() { +inline int getSharedMemPerBlock() +{ int devId; CUDA_CHECK(cudaGetDevice(&devId)); int smemPerBlk; - CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk, - cudaDevAttrMaxSharedMemoryPerBlock, devId)); + CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); return smemPerBlk; } /** helper method to get multi-processor count parameter */ -inline int getMultiProcessorCount() { +inline int getMultiProcessorCount() +{ int devId; CUDA_CHECK(cudaGetDevice(&devId)); int mpCount; - CUDA_CHECK( - cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); + CUDA_CHECK(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); return mpCount; } /** helper method to convert an array on device to a string on host */ template -std::string arr2Str(const T* arr, int size, std::string name, - cudaStream_t stream, int width = 4) { +std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) +{ std::stringstream ss; T* arr_h = (T*)malloc(size * sizeof(T)); @@ -311,53 +326,54 @@ std::string arr2Str(const T* arr, int size, std::string name, /** this seems to be unused, but may be useful in the future */ template -void ASSERT_DEVICE_MEM(T* ptr, std::string name) { +void ASSERT_DEVICE_MEM(T* ptr, std::string name) +{ cudaPointerAttributes s_att; cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); if (s_err != 0 || s_att.device == -1) - std::cout << "Invalid device pointer encountered in " << name - << ". device=" << s_att.device << ", err=" << s_err << std::endl; + std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device + << ", err=" << s_err << std::endl; } -inline uint32_t curTimeMillis() { - auto now = std::chrono::high_resolution_clock::now(); +inline uint32_t curTimeMillis() +{ + auto now = std::chrono::high_resolution_clock::now(); auto duration = now.time_since_epoch(); - return std::chrono::duration_cast(duration) - .count(); + return std::chrono::duration_cast(duration).count(); } /** Helper function to calculate need memory for allocate to store dense matrix. - * @param rows number of rows in matrix - * @param columns number of columns in matrix - * @return need number of items to allocate via allocate() - * @sa allocate() - */ -inline size_t allocLengthForMatrix(size_t rows, size_t columns) { - return rows * columns; -} + * @param rows number of rows in matrix + * @param columns number of columns in matrix + * @return need number of items to allocate via allocate() + * @sa allocate() + */ +inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } /** Helper function to check alignment of pointer. - * @param ptr the pointer to check - * @param alignment to be checked for - * @return true if address in bytes is a multiple of alignment - */ + * @param ptr the pointer to check + * @param alignment to be checked for + * @return true if address in bytes is a multiple of alignment + */ template -bool is_aligned(Type* ptr, size_t alignment) { +bool is_aligned(Type* ptr, size_t alignment) +{ return reinterpret_cast(ptr) % alignment == 0; } /** calculate greatest common divisor of two numbers -* @a integer -* @b integer -* @ return gcd of a and b -*/ + * @a integer + * @b integer + * @ return gcd of a and b + */ template -IntType gcd(IntType a, IntType b) { +IntType gcd(IntType a, IntType b) +{ while (b != 0) { IntType tmp = b; - b = a % b; - a = tmp; + b = a % b; + a = tmp; } return a; } diff --git a/cpp/include/raft/device_atomics.cuh b/cpp/include/raft/device_atomics.cuh index dc8093ca1d..e113ca92eb 100644 --- a/cpp/include/raft/device_atomics.cuh +++ b/cpp/include/raft/device_atomics.cuh @@ -39,9 +39,9 @@ namespace detail { /* @brief binary `sum` operator */ struct DeviceSum { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return lhs + rhs; } }; @@ -49,7 +49,8 @@ struct DeviceSum { /* @brief binary `min` operator */ struct DeviceMin { template - __device__ T operator()(const T& lhs, const T& rhs) { + __device__ T operator()(const T& lhs, const T& rhs) + { return lhs < rhs ? lhs : rhs; } }; @@ -57,43 +58,44 @@ struct DeviceMin { /* @brief binary `max` operator */ struct DeviceMax { template - __device__ T operator()(const T& lhs, const T& rhs) { + __device__ T operator()(const T& lhs, const T& rhs) + { return lhs > rhs ? lhs : rhs; } }; /* @brief binary `product` operator */ struct DeviceProduct { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return lhs * rhs; } }; /* @brief binary `and` operator */ struct DeviceAnd { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return (lhs & rhs); } }; /* @brief binary `or` operator */ struct DeviceOr { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return (lhs | rhs); } }; /* @brief binary `xor` operator */ struct DeviceXor { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return (lhs ^ rhs); } }; @@ -103,9 +105,9 @@ struct DeviceXor { #define errmsg_cast "size mismatch." template -__forceinline__ __device__ T_output type_reinterpret(T_input value) { - static_assert(sizeof(T_output) == sizeof(T_input), - "type_reinterpret for different size"); +__forceinline__ __device__ T_output type_reinterpret(T_input value) +{ + static_assert(sizeof(T_output) == sizeof(T_input), "type_reinterpret for different size"); return *(reinterpret_cast(&value)); } @@ -118,25 +120,22 @@ struct genericAtomicOperationImpl; // single byte atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - Op op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) + { using T_int = unsigned int; - T_int* address_uint32 = - reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); - T_int shift = ((reinterpret_cast(addr) & 3) * 8); + T_int* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); + T_int shift = ((reinterpret_cast(addr) & 3) * 8); T_int old = *address_uint32; T_int assumed; do { - assumed = old; - T target_value = T((old >> shift) & 0xff); - uint8_t updating_value = - type_reinterpret(op(target_value, update_value)); - T_int new_value = - (old & ~(0x000000ff << shift)) | (T_int(updating_value) << shift); - old = atomicCAS(address_uint32, assumed, new_value); + assumed = old; + T target_value = T((old >> shift) & 0xff); + uint8_t updating_value = type_reinterpret(op(target_value, update_value)); + T_int new_value = (old & ~(0x000000ff << shift)) | (T_int(updating_value) << shift); + old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); return T((old >> shift) & 0xff); @@ -146,26 +145,24 @@ struct genericAtomicOperationImpl { // 2 bytes atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - Op op) { - using T_int = unsigned int; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) + { + using T_int = unsigned int; bool is_32_align = (reinterpret_cast(addr) & 2) ? false : true; - T_int* address_uint32 = reinterpret_cast( - reinterpret_cast(addr) - (is_32_align ? 0 : 2)); + T_int* address_uint32 = + reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 0 : 2)); T_int old = *address_uint32; T_int assumed; do { - assumed = old; - T target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); - uint16_t updating_value = - type_reinterpret(op(target_value, update_value)); - - T_int new_value = (is_32_align) - ? (old & 0xffff0000) | updating_value - : (old & 0xffff) | (T_int(updating_value) << 16); - old = atomicCAS(address_uint32, assumed, new_value); + assumed = old; + T target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); + uint16_t updating_value = type_reinterpret(op(target_value, update_value)); + + T_int new_value = (is_32_align) ? (old & 0xffff0000) | updating_value + : (old & 0xffff) | (T_int(updating_value) << 16); + old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); return (is_32_align) ? T(old & 0xffff) : T(old >> 16); @@ -176,15 +173,15 @@ struct genericAtomicOperationImpl { // 4 bytes atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - Op op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) + { using T_int = unsigned int; T old_value = *addr; T assumed{old_value}; do { - assumed = old_value; + assumed = old_value; const T new_value = op(old_value, update_value); T_int ret = atomicCAS(reinterpret_cast(addr), @@ -201,8 +198,8 @@ struct genericAtomicOperationImpl { // 8 bytes atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - Op op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); @@ -210,7 +207,7 @@ struct genericAtomicOperationImpl { T assumed{old_value}; do { - assumed = old_value; + assumed = old_value; const T new_value = op(old_value, update_value); T_int ret = atomicCAS(reinterpret_cast(addr), @@ -226,8 +223,8 @@ struct genericAtomicOperationImpl { // ------------------------------------------------------------------------------------------------- // specialized functions for operators -// `atomicAdd` supports int, unsigned int, unsigend long long int, float, double (long long int is not supproted.) -// `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int +// `atomicAdd` supports int, unsigned int, unsigend long long int, float, double (long long int is +// not supproted.) `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int // `atomicAnd`, `atomicOr`, `atomicXor` support int, unsigned int, unsigned long long int // CUDA natively supports `unsigned long long int` for `atomicAdd`, @@ -240,12 +237,11 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceSum op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAdd(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicAdd(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -253,12 +249,11 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = unsigned long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceSum op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAdd(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicAdd(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -273,12 +268,11 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = long long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceSum op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAdd(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicAdd(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -286,12 +280,11 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = unsigned long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceMin op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMin op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T ret = atomicMin(reinterpret_cast(addr), - type_reinterpret(update_value)); + T ret = atomicMin(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -299,48 +292,44 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = unsigned long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceMax op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T ret = atomicMax(reinterpret_cast(addr), - type_reinterpret(update_value)); + T ret = atomicMax(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceAnd op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceAnd op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAnd(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicAnd(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceOr op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceOr op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicOr(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicOr(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceXor op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceXor op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicXor(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicXor(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -353,13 +342,12 @@ struct typesAtomicCASImpl; template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, - T const& update_value) { + __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) + { using T_int = unsigned int; - T_int shift = ((reinterpret_cast(addr) & 3) * 8); - T_int* address_uint32 = - reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); + T_int shift = ((reinterpret_cast(addr) & 3) * 8); + T_int* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); // the 'target_value' in `old` can be different from `compare` // because other thread may update the value @@ -370,15 +358,14 @@ struct typesAtomicCASImpl { uint8_t u_val = type_reinterpret(update_value); do { - assumed = old; + assumed = old; target_value = T((old >> shift) & 0xff); // have to compare `target_value` and `compare` before calling atomicCAS // the `target_value` in `old` can be different with `compare` if (target_value != compare) break; - T_int new_value = - (old & ~(0x000000ff << shift)) | (T_int(u_val) << shift); - old = atomicCAS(address_uint32, assumed, new_value); + T_int new_value = (old & ~(0x000000ff << shift)) | (T_int(u_val) << shift); + old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); return target_value; @@ -387,13 +374,13 @@ struct typesAtomicCASImpl { template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, - T const& update_value) { + __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) + { using T_int = unsigned int; bool is_32_align = (reinterpret_cast(addr) & 2) ? false : true; - T_int* address_uint32 = reinterpret_cast( - reinterpret_cast(addr) - (is_32_align ? 0 : 2)); + T_int* address_uint32 = + reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 0 : 2)); T_int old = *address_uint32; T_int assumed; @@ -401,12 +388,12 @@ struct typesAtomicCASImpl { uint16_t u_val = type_reinterpret(update_value); do { - assumed = old; + assumed = old; target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); if (target_value != compare) break; - T_int new_value = (is_32_align) ? (old & 0xffff0000) | u_val - : (old & 0xffff) | (T_int(u_val) << 16); + T_int new_value = + (is_32_align) ? (old & 0xffff0000) | u_val : (old & 0xffff) | (T_int(u_val) << 16); old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); @@ -416,8 +403,8 @@ struct typesAtomicCASImpl { template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, - T const& update_value) { + __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) + { using T_int = unsigned int; T_int ret = atomicCAS(reinterpret_cast(addr), @@ -431,8 +418,8 @@ struct typesAtomicCASImpl { // 8 bytes atomic operation template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, - T const& update_value) { + __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); @@ -464,11 +451,10 @@ struct typesAtomicCASImpl { * @returns The old value at `address` * -------------------------------------------------------------------------**/ template -typename std::enable_if_t::value, T> __forceinline__ - __device__ - genericAtomicOperation(T* address, T const& update_value, BinaryOp op) { - auto fun = - raft::device_atomics::detail::genericAtomicOperationImpl{}; +typename std::enable_if_t::value, T> __forceinline__ __device__ +genericAtomicOperation(T* address, T const& update_value, BinaryOp op) +{ + auto fun = raft::device_atomics::detail::genericAtomicOperationImpl{}; return T(fun(address, update_value, op)); } @@ -476,11 +462,11 @@ typename std::enable_if_t::value, T> __forceinline__ template __forceinline__ __device__ bool genericAtomicOperation(bool* address, bool const& update_value, - BinaryOp op) { + BinaryOp op) +{ using T = bool; // don't use underlying type to apply operation for bool - auto fun = - raft::device_atomics::detail::genericAtomicOperationImpl{}; + auto fun = raft::device_atomics::detail::genericAtomicOperationImpl{}; return T(fun(address, update_value, op)); } @@ -502,9 +488,9 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address, * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicAdd(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceSum{}); +__forceinline__ __device__ T atomicAdd(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceSum{}); } /** @@ -523,9 +509,9 @@ __forceinline__ __device__ T atomicAdd(T* address, T val) { * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMin(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceMin{}); +__forceinline__ __device__ T atomicMin(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceMin{}); } /** @@ -544,9 +530,9 @@ __forceinline__ __device__ T atomicMin(T* address, T val) { * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMax(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceMax{}); +__forceinline__ __device__ T atomicMax(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceMax{}); } /** @@ -566,9 +552,9 @@ __forceinline__ __device__ T atomicMax(T* address, T val) { * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicCAS(T* address, T compare, T val) { - return raft::device_atomics::detail::typesAtomicCASImpl()(address, compare, - val); +__forceinline__ __device__ T atomicCAS(T* address, T compare, T val) +{ + return raft::device_atomics::detail::typesAtomicCASImpl()(address, compare, val); } /** @@ -586,11 +572,10 @@ __forceinline__ __device__ T atomicCAS(T* address, T compare, T val) { * * @returns The old value at `address` */ -template ::value, T>* = nullptr> -__forceinline__ __device__ T atomicAnd(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceAnd{}); +template ::value, T>* = nullptr> +__forceinline__ __device__ T atomicAnd(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceAnd{}); } /** @@ -608,11 +593,10 @@ __forceinline__ __device__ T atomicAnd(T* address, T val) { * * @returns The old value at `address` */ -template ::value, T>* = nullptr> -__forceinline__ __device__ T atomicOr(T* address, T val) { - return raft::genericAtomicOperation(address, val, - raft::device_atomics::detail::DeviceOr{}); +template ::value, T>* = nullptr> +__forceinline__ __device__ T atomicOr(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceOr{}); } /** @@ -630,9 +614,8 @@ __forceinline__ __device__ T atomicOr(T* address, T val) { * * @returns The old value at `address` */ -template ::value, T>* = nullptr> -__forceinline__ __device__ T atomicXor(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceXor{}); +template ::value, T>* = nullptr> +__forceinline__ __device__ T atomicXor(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceXor{}); } diff --git a/cpp/include/raft/distance/canberra.cuh b/cpp/include/raft/distance/canberra.cuh index b87c295eb0..61622d7c87 100644 --- a/cpp/include/raft/distance/canberra.cuh +++ b/cpp/include/raft/distance/canberra.cuh @@ -44,75 +44,108 @@ namespace distance { * @param fin_op the final gemm epilogue lambda * @param stream cuda stream to launch work */ -template -static void canberraImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +static void canberraImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { const auto diff = raft::L1Op()(x - y); - const auto add = raft::myAbs(x) + raft::myAbs(y); + const auto add = raft::myAbs(x) + raft::myAbs(y); // deal with potential for 0 in denominator by // forcing 1/0 instead acc += ((add != 0) * diff / (add + (add == 0))); }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { return; }; if (isRowMajor) { - auto canberraRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, canberraRowMajor); + auto canberraRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, canberraRowMajor); canberraRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto canberraColMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, canberraColMajor); + auto canberraColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, canberraColMajor); canberraColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void canberra(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +void canberra(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - canberraImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + canberraImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - canberraImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + canberraImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { canberraImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -137,16 +170,25 @@ void canberra(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param[in] stream cuda stream to launch work * @param[in] isRowMajor whether the input and output matrices are row major */ -template -void canberraImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void canberraImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - canberraOutType; + typedef typename std::conditional::type canberraOutType; Index_ lda, ldb, ldd; - canberraOutType *pDcast = reinterpret_cast(pD); + canberraOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; canberra( diff --git a/cpp/include/raft/distance/chebyshev.cuh b/cpp/include/raft/distance/chebyshev.cuh index 8d53408cf8..b7ecdb945b 100644 --- a/cpp/include/raft/distance/chebyshev.cuh +++ b/cpp/include/raft/distance/chebyshev.cuh @@ -44,72 +44,105 @@ namespace distance { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void chebyshevImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, - IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +static void chebyshevImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { const auto diff = raft::L1Op()(x - y); - acc = raft::myMax(acc, diff); + acc = raft::myMax(acc, diff); }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { return; }; if (isRowMajor) { - auto chebyshevRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - chebyshevRowMajor); + auto chebyshevRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, chebyshevRowMajor); chebyshevRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto chebyshevColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - chebyshevColMajor); + auto chebyshevColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, chebyshevColMajor); chebyshevColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void chebyshev(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void chebyshev(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - chebyshevImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + chebyshevImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - chebyshevImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + chebyshevImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { chebyshevImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -134,16 +167,25 @@ void chebyshev(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param[in] stream cuda stream to launch work * @param[in] isRowMajor whether the input and output matrices are row major */ -template -void chebyshevImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void chebyshevImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - chebyshevOutType; + typedef typename std::conditional::type chebyshevOutType; Index_ lda, ldb, ldd; - chebyshevOutType *pDcast = reinterpret_cast(pD); + chebyshevOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; chebyshev( diff --git a/cpp/include/raft/distance/cosine.cuh b/cpp/include/raft/distance/cosine.cuh index ed9bd28b7f..3e034e15d2 100644 --- a/cpp/include/raft/distance/cosine.cuh +++ b/cpp/include/raft/distance/cosine.cuh @@ -24,7 +24,7 @@ namespace distance { /** * @brief the cosine distance matrix calculation implementer - * It computes the following equation: + * It computes the following equation: * C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2))) * @tparam DataT input data-type (for A and B matrices) * @tparam AccT accumulation data-type @@ -49,30 +49,43 @@ namespace distance { * @param fin_op the final gemm epilogue lambda * @param stream cuda stream to launch cuda operations. */ -template -void cosineImpl(const DataT *x, const DataT *y, const DataT *xn, - const DataT *yn, IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, - IdxT ldd, OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +void cosineImpl(const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - acc += x * y; - }; + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll @@ -85,43 +98,66 @@ void cosineImpl(const DataT *x, const DataT *y, const DataT *xn, constexpr size_t shmemSize = KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); if (isRowMajor) { - auto cosineRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineRowMajor); + auto cosineRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineRowMajor); cosineRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto cosineColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineColMajor); + auto cosineColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineColMajor); cosineColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, const DataT *xn, const DataT *yn, - OutT *dOutput, FinalLambda fin_op, cudaStream_t stream) { +template +void cosine(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - cosineImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, - fin_op, stream); + cosineImpl( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - cosineImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, - fin_op, stream); + cosineImpl( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { cosineImpl( x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -130,7 +166,7 @@ void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, /** * @brief the expanded cosine distance matrix calculation - * It computes the following equation: + * It computes the following equation: * C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2))) * @tparam IType input data-type (for A and B matrices) * @tparam AccType accumulation data-type @@ -151,12 +187,23 @@ void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void cosineAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA, - const InType *pB, OutType *pD, AccType *workspace, - size_t worksize, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void cosineAlgo1(Index_ m, + Index_ n, + Index_ k, + const InType* pA, + const InType* pB, + OutType* pD, + AccType* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ auto norm_op = [] __device__(AccType in) { return raft::mySqrt(in); }; // Wrap fin_op to allow computing 1 - pA before calling fin_op @@ -165,39 +212,33 @@ void cosineAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA, }; typedef std::is_same is_bool; - typedef typename std::conditional::type - CosOutType; - CosOutType *pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type CosOutType; + CosOutType* pDcast = reinterpret_cast(pD); - ASSERT(!(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || - (worksize < m * sizeof(AccType))), - "workspace size error"); + ASSERT( + !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))), + "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); Index_ lda, ldb, ldd; - InType *col_vec = workspace; - InType *row_vec = workspace; + InType* col_vec = workspace; + InType* row_vec = workspace; if (pA != pB) { row_vec += m; - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); - raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); + raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } else { - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } if (isRowMajor) { lda = k, ldb = k, ldd = n; cosine( - m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, wrapped_fin_op, - stream); + m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, wrapped_fin_op, stream); } else { lda = n, ldb = m, ldd = m; - cosine(n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast, - wrapped_fin_op, stream); + cosine( + n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast, wrapped_fin_op, stream); } } diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index 1b39a6ec18..1627753b43 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -32,140 +32,314 @@ namespace raft { namespace distance { namespace { -template struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg = 2.0f) {} + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg = 2.0f) + { + } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { - raft::distance::euclideanAlgo1(m, n, k, x, y, dist, false, - (AccType *)workspace, worksize, - fin_op, stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { + raft::distance::euclideanAlgo1( + m, n, k, x, y, dist, false, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { - raft::distance::euclideanAlgo1(m, n, k, x, y, dist, true, - (AccType *)workspace, worksize, - fin_op, stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { + raft::distance::euclideanAlgo1( + m, n, k, x, y, dist, true, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { raft::distance::cosineAlgo1( - m, n, k, x, y, dist, (AccType *)workspace, worksize, fin_op, stream, - isRowMajor); + m, n, k, x, y, dist, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { - raft::distance::euclideanAlgo2(m, n, k, x, y, dist, false, fin_op, - stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { + raft::distance::euclideanAlgo2( + m, n, k, x, y, dist, false, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { - raft::distance::euclideanAlgo2(m, n, k, x, y, dist, true, fin_op, - stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { + raft::distance::euclideanAlgo2( + m, n, k, x, y, dist, true, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { raft::distance::l1Impl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { - raft::distance::chebyshevImpl(m, n, k, x, y, dist, fin_op, stream, - isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { + raft::distance::chebyshevImpl( + m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { - raft::distance::hellingerImpl(m, n, k, x, y, dist, fin_op, stream, - isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { + raft::distance::hellingerImpl( + m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { - raft::distance::minkowskiImpl(m, n, k, x, y, dist, fin_op, stream, - isRowMajor, metric_arg); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { + raft::distance::minkowskiImpl( + m, n, k, x, y, dist, fin_op, stream, isRowMajor, metric_arg); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { raft::distance::canberraImpl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } @@ -189,13 +363,15 @@ struct DistanceImpl -size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n, - Index_ k) { - size_t worksize = 0; - constexpr bool is_allocated = - distanceType <= raft::distance::DistanceType::CosineExpanded; +template +size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k) +{ + size_t worksize = 0; + constexpr bool is_allocated = distanceType <= raft::distance::DistanceType::CosineExpanded; if (is_allocated) { worksize += m * sizeof(AccType); if (x != y) worksize += n * sizeof(AccType); @@ -228,17 +404,27 @@ size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n, * as follows:
OutType fin_op(AccType in, int g_idx);
. If one needs * any other parameters, feel free to pass them via closure. */ -template -void distance(const InType *x, const InType *y, OutType *dist, Index_ m, - Index_ n, Index_ k, void *workspace, size_t worksize, - FinalLambda fin_op, cudaStream_t stream, bool isRowMajor = true, - InType metric_arg = 2.0f) { - DistanceImpl - distImpl; - distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream, - isRowMajor, metric_arg); +void distance(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor = true, + InType metric_arg = 2.0f) +{ + DistanceImpl distImpl; + distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg); CUDA_CHECK(cudaPeekAtLastError()); } @@ -263,18 +449,26 @@ void distance(const InType *x, const InType *y, OutType *dist, Index_ m, * @note if workspace is passed as nullptr, this will return in * worksize, the number of bytes of workspace required */ -template -void distance(const InType *x, const InType *y, OutType *dist, Index_ m, - Index_ n, Index_ k, void *workspace, size_t worksize, - cudaStream_t stream, bool isRowMajor = true, - InType metric_arg = 2.0f) { - auto default_fin_op = [] __device__(AccType d_val, Index_ g_d_idx) { - return d_val; - }; - distance(x, y, dist, m, n, k, workspace, worksize, default_fin_op, - stream, isRowMajor, metric_arg); +template +void distance(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + cudaStream_t stream, + bool isRowMajor = true, + InType metric_arg = 2.0f) +{ + auto default_fin_op = [] __device__(AccType d_val, Index_ g_d_idx) { return d_val; }; + distance( + x, y, dist, m, n, k, workspace, worksize, default_fin_op, stream, isRowMajor, metric_arg); CUDA_CHECK(cudaPeekAtLastError()); } @@ -298,39 +492,47 @@ void distance(const InType *x, const InType *y, OutType *dist, Index_ m, * @param isRowMajor whether the matrices are row-major or col-major */ template -void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m, - Index_ n, Index_ k, - raft::mr::device::buffer &workspace, - cudaStream_t stream, bool isRowMajor, - Type metric_arg = 2.0f) { - auto worksize = - getWorkspaceSize(x, y, m, n, k); +void pairwise_distance_impl(const Type* x, + const Type* y, + Type* dist, + Index_ m, + Index_ n, + Index_ k, + raft::mr::device::buffer& workspace, + cudaStream_t stream, + bool isRowMajor, + Type metric_arg = 2.0f) +{ + auto worksize = getWorkspaceSize(x, y, m, n, k); workspace.resize(worksize, stream); - distance(x, y, dist, m, n, k, - workspace.data(), worksize, - stream, isRowMajor, metric_arg); + distance( + x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor, metric_arg); } template -void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m, - Index_ n, Index_ k, - raft::mr::device::buffer &workspace, - raft::distance::DistanceType metric, cudaStream_t stream, - bool isRowMajor = true, Type metric_arg = 2.0f) { +void pairwise_distance(const Type* x, + const Type* y, + Type* dist, + Index_ m, + Index_ n, + Index_ k, + raft::mr::device::buffer& workspace, + raft::distance::DistanceType metric, + cudaStream_t stream, + bool isRowMajor = true, + Type metric_arg = 2.0f) +{ switch (metric) { case raft::distance::DistanceType::L2Expanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::L2SqrtExpanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::CosineExpanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::L1: @@ -338,13 +540,11 @@ void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m, x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::L2Unexpanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::L2SqrtUnexpanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::Linf: @@ -352,22 +552,18 @@ void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m, x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::HellingerExpanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::LpUnexpanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor, metric_arg); break; case raft::distance::DistanceType::Canberra: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; - default: - THROW("Unknown or unsupported distance metric '%d'!", (int)metric); + default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); }; } /** @} */ diff --git a/cpp/include/raft/distance/euclidean.cuh b/cpp/include/raft/distance/euclidean.cuh index 484da0e5bf..46d0a1a4a9 100644 --- a/cpp/include/raft/distance/euclidean.cuh +++ b/cpp/include/raft/distance/euclidean.cuh @@ -48,30 +48,44 @@ namespace distance { * @param fin_op the final gemm epilogue lambda * @param stream cuda stream to launch cuda operations. */ -template -void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn, - const DataT *yn, IdxT m, IdxT n, IdxT k, IdxT lda, - IdxT ldb, IdxT ldd, bool sqrt, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void euclideanExpImpl(const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + bool sqrt, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - acc += x * y; - }; + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [sqrt] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll @@ -93,47 +107,68 @@ void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn, constexpr size_t shmemSize = KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); if (isRowMajor) { - auto euclideanExpRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, shmemSize, euclideanExpRowMajor); + auto euclideanExpRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, euclideanExpRowMajor); euclideanExpRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto euclideanExpColMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, shmemSize, euclideanExpColMajor); + auto euclideanExpColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, euclideanExpColMajor); euclideanExpColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void euclideanExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, const DataT *xn, - const DataT *yn, bool sqrt, OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +void euclideanExp(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + bool sqrt, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - euclideanExpImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, - dOutput, fin_op, stream); + euclideanExpImpl( + x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - euclideanExpImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, - dOutput, fin_op, stream); + euclideanExpImpl( + x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); } else { euclideanExpImpl( x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); @@ -161,53 +196,59 @@ void euclideanExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void euclideanAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA, - const InType *pB, OutType *pD, bool enable_sqrt, - AccType *workspace, size_t &worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor) { +template +void euclideanAlgo1(Index_ m, + Index_ n, + Index_ k, + const InType* pA, + const InType* pB, + OutType* pD, + bool enable_sqrt, + AccType* workspace, + size_t& worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ auto norm_op = [] __device__(InType in) { return in; }; typedef std::is_same is_bool; - typedef typename std::conditional::type - ExpOutType; - ExpOutType *pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type ExpOutType; + ExpOutType* pDcast = reinterpret_cast(pD); - ASSERT(!(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || - (worksize < m * sizeof(AccType))), - "workspace size error"); + ASSERT( + !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))), + "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); Index_ lda, ldb, ldd; - InType *col_vec = workspace; - InType *row_vec = workspace; + InType* col_vec = workspace; + InType* row_vec = workspace; if (pA != pB) { row_vec += m; - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); - raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); + raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } else { - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } if (isRowMajor) { lda = k, ldb = k, ldd = n; euclideanExp( - m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast, - fin_op, stream); + m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast, fin_op, stream); } else { lda = n, ldb = m, ldd = m; euclideanExp( - n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast, - fin_op, stream); + n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast, fin_op, stream); } } /** - * @brief the unexpanded euclidean distance matrix calculation + * @brief the unexpanded euclidean distance matrix calculation * It computes the following equation: cij = op((ai-bj)^2) * @tparam DataT input data-type (for A and B matrices) * @tparam AccT accumulation data-type @@ -227,16 +268,30 @@ void euclideanAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA, * @param[output] pD output matrix * @param fin_op the final gemm epilogue lambda */ -template -void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, bool sqrt, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void euclideanUnExpImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + bool sqrt, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); @@ -247,10 +302,11 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [sqrt] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { if (sqrt) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { @@ -263,48 +319,68 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; if (isRowMajor) { - auto euclideanUnExpRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - euclideanUnExpRowMajor); + auto euclideanUnExpRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, euclideanUnExpRowMajor); euclideanUnExpRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto euclideanUnExpColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - euclideanUnExpColMajor); + auto euclideanUnExpColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, euclideanUnExpColMajor); euclideanUnExpColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void euclideanUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, bool sqrt, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void euclideanUnExp(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + bool sqrt, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - euclideanUnExpImpl(x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, - fin_op, stream); + euclideanUnExpImpl( + x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - euclideanUnExpImpl(x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, - fin_op, stream); + euclideanUnExpImpl( + x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); } else { euclideanUnExpImpl( x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); @@ -330,15 +406,25 @@ void euclideanUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void euclideanAlgo2(Index_ m, Index_ n, Index_ k, const InType *pA, - const InType *pB, OutType *pD, bool enable_sqrt, - FinalLambda fin_op, cudaStream_t stream, bool isRowMajor) { +template +void euclideanAlgo2(Index_ m, + Index_ n, + Index_ k, + const InType* pA, + const InType* pB, + OutType* pD, + bool enable_sqrt, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - UnExpOutType; - UnExpOutType *pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type UnExpOutType; + UnExpOutType* pDcast = reinterpret_cast(pD); Index_ lda, ldb, ldd; if (isRowMajor) { diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh index b96a536e38..f80b4eb8f7 100644 --- a/cpp/include/raft/distance/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/fused_l2_nn.cuh @@ -35,24 +35,24 @@ template struct KVPMinReduce { typedef cub::KeyValuePair KVP; - DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } }; // KVPMinReduce template struct MinAndDistanceReduceOp { typedef typename cub::KeyValuePair KVP; - DI void operator()(LabelT rid, KVP* out, const KVP& other) { + DI void operator()(LabelT rid, KVP* out, const KVP& other) + { if (other.value < out->value) { - out->key = other.key; + out->key = other.key; out->value = other.value; } } - DI void init(KVP* out, DataT maxVal) { - out->key = -1; + DI void init(KVP* out, DataT maxVal) + { + out->key = -1; out->value = maxVal; } }; @@ -60,30 +60,28 @@ struct MinAndDistanceReduceOp { template struct MinReduceOp { typedef typename cub::KeyValuePair KVP; - DI void operator()(LabelT rid, DataT* out, const KVP& other) { - if (other.value < *out) { - *out = other.value; - } + DI void operator()(LabelT rid, DataT* out, const KVP& other) + { + if (other.value < *out) { *out = other.value; } } DI void init(DataT* out, DataT maxVal) { *out = maxVal; } }; template -__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) { +__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) +{ auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < m) { - redOp.init(min + tid, maxVal); - } + if (tid < m) { redOp.init(min + tid, maxVal); } } // TODO: specialize this function for MinAndDistanceReduceOp // with atomicCAS of 64 bit which will eliminate mutex and shfls -template -DI void updateReducedVal(int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, - IdxT m, IdxT gridStrideY) { - const auto lid = threadIdx.x % raft::WarpSize; +template +DI void updateReducedVal( + int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, IdxT m, IdxT gridStrideY) +{ + const auto lid = threadIdx.x % raft::WarpSize; const auto accrowid = threadIdx.x / P::AccThCols; // for now have first lane from each warp update a unique output row. This @@ -108,21 +106,38 @@ DI void updateReducedVal(int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, if (j < (raft::WarpSize / P::AccThCols) - 1) { #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { - auto tmpkey = raft::shfl(val[i].key, (j + 1) * P::AccThCols); + auto tmpkey = raft::shfl(val[i].key, (j + 1) * P::AccThCols); auto tmpvalue = raft::shfl(val[i].value, (j + 1) * P::AccThCols); - val[i] = {tmpkey, tmpvalue}; + val[i] = {tmpkey, tmpvalue}; } } } } -template -__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( - OutT* min, const DataT* x, const DataT* y, const DataT* xn, const DataT* yn, - IdxT m, IdxT n, IdxT k, DataT maxVal, int* mutex, ReduceOpT redOp, - KVPReduceOpT pairRedOp, CoreLambda core_op, FinalLambda fin_op) { +__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + DataT maxVal, + int* mutex, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + CoreLambda core_op, + FinalLambda fin_op) +{ extern __shared__ char smem[]; typedef cub::KeyValuePair KVPair; @@ -135,7 +150,9 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( // epilogue operation lambda for final value calculation auto epilog_lambda = [n, pairRedOp, &val, maxVal] __device__( DataT acc[P::AccRowsPerTh][P::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, IdxT gridStrideY) { KVPReduceOpT pairRed_op(pairRedOp); @@ -164,72 +181,105 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { auto tmpkey = acccolid + j * P::AccThCols + gridStrideX; - KVPair tmp = {tmpkey, acc[i][j]}; + KVPair tmp = {tmpkey, acc[i][j]}; if (tmpkey < n) { - val[i] = - pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); + val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); } } } }; - auto rowEpilog_lambda = [m, mutex, min, pairRedOp, redOp, &val, - maxVal] __device__(IdxT gridStrideY) { - KVPReduceOpT pairRed_op(pairRedOp); - ReduceOpT red_op(redOp); + auto rowEpilog_lambda = + [m, mutex, min, pairRedOp, redOp, &val, maxVal] __device__(IdxT gridStrideY) { + KVPReduceOpT pairRed_op(pairRedOp); + ReduceOpT red_op(redOp); - const auto accrowid = threadIdx.x / P::AccThCols; - const auto lid = raft::laneId(); + const auto accrowid = threadIdx.x / P::AccThCols; + const auto lid = raft::laneId(); // reduce #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { + for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll - for (int j = P::AccThCols / 2; j > 0; j >>= 1) { - auto tmpkey = raft::shfl(val[i].key, lid + j); - auto tmpvalue = raft::shfl(val[i].value, lid + j); - KVPair tmp = {tmpkey, tmpvalue}; - val[i] = - pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); + for (int j = P::AccThCols / 2; j > 0; j >>= 1) { + auto tmpkey = raft::shfl(val[i].key, lid + j); + auto tmpvalue = raft::shfl(val[i].value, lid + j); + KVPair tmp = {tmpkey, tmpvalue}; + val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); + } } - } - updateReducedVal(mutex, min, val, red_op, - m, gridStrideY); + updateReducedVal(mutex, min, val, red_op, m, gridStrideY); // reset the val array. #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - val[i] = {-1, maxVal}; - } - }; + for (int i = 0; i < P::AccRowsPerTh; ++i) { + val[i] = {-1, maxVal}; + } + }; IdxT lda = k, ldb = k, ldd = n; - PairwiseDistances - obj(x, y, m, n, k, lda, ldb, ldd, xn, yn, nullptr, smem, core_op, - epilog_lambda, fin_op, rowEpilog_lambda); + PairwiseDistances + obj(x, + y, + m, + n, + k, + lda, + ldb, + ldd, + xn, + yn, + nullptr, + smem, + core_op, + epilog_lambda, + fin_op, + rowEpilog_lambda); obj.run(); } -template -void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, - const DataT* yn, IdxT m, IdxT n, IdxT k, int* workspace, - ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt, - bool initOutBuffer, cudaStream_t stream) { +template +void fusedL2NNImpl(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + int* workspace, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream) +{ typedef typename linalg::Policy4x4::Policy P; dim3 blk(P::Nthreads); - auto nblks = raft::ceildiv(m, P::Nthreads); + auto nblks = raft::ceildiv(m, P::Nthreads); constexpr auto maxVal = std::numeric_limits::max(); typedef cub::KeyValuePair KVPair; // Accumulation operation lambda - auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) { - acc += x * y; - }; + auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) { acc += x * y; }; CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream)); if (initOutBuffer) { @@ -240,25 +290,34 @@ void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, auto fin_op = [] __device__(DataT d_val, int g_d_idx) { return d_val; }; - constexpr size_t shmemSize = - P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT)); + constexpr size_t shmemSize = P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT)); if (sqrt) { - auto fusedL2NNSqrt = - fusedL2NNkernel; - dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NNSqrt); + auto fusedL2NNSqrt = fusedL2NNkernel; + dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NNSqrt); fusedL2NNSqrt<<>>( - min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, - core_lambda, fin_op); + min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op); } else { - auto fusedL2NN = - fusedL2NNkernel; - dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NN); - fusedL2NN<<>>(min, x, y, xn, yn, m, n, k, - maxVal, workspace, redOp, - pairRedOp, core_lambda, fin_op); + auto fusedL2NN = fusedL2NNkernel; + dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NN); + fusedL2NN<<>>( + min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); @@ -299,25 +358,32 @@ void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, * main kernel launch * @param[in] stream cuda stream */ -template -void fusedL2NN(OutT* min, const DataT* x, const DataT* y, const DataT* xn, - const DataT* yn, IdxT m, IdxT n, IdxT k, void* workspace, - ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt, - bool initOutBuffer, cudaStream_t stream) { +template +void fusedL2NN(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + void* workspace, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream) +{ size_t bytes = sizeof(DataT) * k; if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) { fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, - initOutBuffer, stream); + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) { fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, - initOutBuffer, stream); + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); } else { fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, - initOutBuffer, stream); + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); } } diff --git a/cpp/include/raft/distance/hellinger.cuh b/cpp/include/raft/distance/hellinger.cuh index f7ad3ed1ba..c8c7dad7d4 100644 --- a/cpp/include/raft/distance/hellinger.cuh +++ b/cpp/include/raft/distance/hellinger.cuh @@ -23,7 +23,7 @@ namespace distance { /** * @brief the Hellinger distance matrix using the expanded form: - * It computes the following equation: + * It computes the following equation: cij = sqrt(1 - sum(sqrt(x_k * y_k))) * This distance computation modifies A and B by computing a sqrt * and then performing a `pow(x, 2)` to convert it back. Because of this, @@ -51,29 +51,40 @@ namespace distance { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void hellingerImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, - IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +static void hellingerImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); - auto unaryOp_lambda = [] __device__(DataT input) { - return raft::mySqrt(input); - }; + auto unaryOp_lambda = [] __device__(DataT input) { return raft::mySqrt(input); }; // First sqrt x and y raft::linalg::unaryOp( - (DataT *)x, x, m * k, unaryOp_lambda, stream); + (DataT*)x, x, m * k, unaryOp_lambda, stream); if (x != y) { raft::linalg::unaryOp( - (DataT *)y, y, n * k, unaryOp_lambda, stream); + (DataT*)y, y, n * k, unaryOp_lambda, stream); } // Accumulation operation lambda @@ -84,71 +95,91 @@ static void hellingerImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative - const auto finalVal = (1 - acc[i][j]); + const auto finalVal = (1 - acc[i][j]); const auto rectifier = (!signbit(finalVal)); - acc[i][j] = raft::mySqrt(rectifier * finalVal); + acc[i][j] = raft::mySqrt(rectifier * finalVal); } } }; if (isRowMajor) { - auto hellingerRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - hellingerRowMajor); + auto hellingerRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hellingerRowMajor); hellingerRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto hellingerColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - hellingerColMajor); + auto hellingerColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hellingerColMajor); hellingerColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } // Revert sqrt of x and y raft::linalg::unaryOp( - (DataT *)x, x, m * k, unaryOp_lambda, stream); + (DataT*)x, x, m * k, unaryOp_lambda, stream); if (x != y) { raft::linalg::unaryOp( - (DataT *)y, y, n * k, unaryOp_lambda, stream); + (DataT*)y, y, n * k, unaryOp_lambda, stream); } CUDA_CHECK(cudaGetLastError()); } -template -void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void hellinger(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - hellingerImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + hellingerImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - hellingerImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + hellingerImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { hellingerImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -157,7 +188,7 @@ void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, /** * @brief the Hellinger distance matrix calculation - * It computes the following equation: + * It computes the following equation: sqrt(1 - sum(sqrt(x_k * y_k)) * This distance computation modifies A and B by computing a sqrt * and then performing a `pow(x, 2)` to convert it back. Because of this, @@ -179,16 +210,25 @@ void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void hellingerImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void hellingerImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - hellingerOutType; + typedef typename std::conditional::type hellingerOutType; Index_ lda, ldb, ldd; - hellingerOutType *pDcast = reinterpret_cast(pD); + hellingerOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; hellinger( diff --git a/cpp/include/raft/distance/l1.cuh b/cpp/include/raft/distance/l1.cuh index 6ab084f041..268e269391 100644 --- a/cpp/include/raft/distance/l1.cuh +++ b/cpp/include/raft/distance/l1.cuh @@ -42,16 +42,29 @@ namespace distance { * @param[output] pD output matrix * @param fin_op the final gemm epilogue lambda */ -template -static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +static void l1Impl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); @@ -62,47 +75,69 @@ static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { return; }; if (isRowMajor) { - auto l1RowMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, l1RowMajor); + auto l1RowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, l1RowMajor); l1RowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto l1ColMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, l1ColMajor); + auto l1ColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, l1ColMajor); l1ColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void l1(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, const DataT *x, - const DataT *y, OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +void l1(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - l1Impl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + l1Impl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { l1Impl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -130,16 +165,25 @@ void l1(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, const DataT *x, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void l1Impl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void l1Impl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef - typename std::conditional::type L1OutType; + typedef typename std::conditional::type L1OutType; Index_ lda, ldb, ldd; - L1OutType *pDcast = reinterpret_cast(pD); + L1OutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; l1( diff --git a/cpp/include/raft/distance/minkowski.cuh b/cpp/include/raft/distance/minkowski.cuh index 803f5fc78a..c021954f32 100644 --- a/cpp/include/raft/distance/minkowski.cuh +++ b/cpp/include/raft/distance/minkowski.cuh @@ -21,7 +21,7 @@ namespace raft { namespace distance { /** - * @brief the unexpanded Minkowski distance matrix calculation + * @brief the unexpanded Minkowski distance matrix calculation * It computes the following equation: cij = sum(|x - y|^p)^(1/p) * @tparam DataT input data-type (for A and B matrices) * @tparam AccT accumulation data-type @@ -44,16 +44,30 @@ namespace distance { * @param[in] stream cuda stream to launch work * @param[in] the value of `p` for Minkowski (l-p) distances. */ -template -void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream, DataT p) { +template +void minkowskiUnExpImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream, + DataT p) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); @@ -64,10 +78,11 @@ void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [p] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [p] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { const auto one_over_p = 1.0f / p; #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { @@ -79,48 +94,68 @@ void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; if (isRowMajor) { - auto minkowskiUnExpRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - minkowskiUnExpRowMajor); + auto minkowskiUnExpRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, minkowskiUnExpRowMajor); minkowskiUnExpRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto minkowskiUnExpColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - minkowskiUnExpColMajor); + auto minkowskiUnExpColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, minkowskiUnExpColMajor); minkowskiUnExpColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void minkowskiUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream, DataT metric_arg) { +template +void minkowskiUnExp(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream, + DataT metric_arg) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - minkowskiUnExpImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, - fin_op, stream, metric_arg); + minkowskiUnExpImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - minkowskiUnExpImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, - fin_op, stream, metric_arg); + minkowskiUnExpImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); } else { minkowskiUnExpImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); @@ -146,15 +181,25 @@ void minkowskiUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param[in] isRowMajor whether the input and output matrices are row major * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances. */ -template -void minkowskiImpl(Index_ m, Index_ n, Index_ k, const InType *pA, - const InType *pB, OutType *pD, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { +template +void minkowskiImpl(Index_ m, + Index_ n, + Index_ k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - LpUnexpOutType; - LpUnexpOutType *pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type LpUnexpOutType; + LpUnexpOutType* pDcast = reinterpret_cast(pD); Index_ lda, ldb, ldd; if (isRowMajor) { diff --git a/cpp/include/raft/distance/pairwise_distance_base.cuh b/cpp/include/raft/distance/pairwise_distance_base.cuh index 43abc9eb65..3db4dc0131 100644 --- a/cpp/include/raft/distance/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/pairwise_distance_base.cuh @@ -31,11 +31,11 @@ namespace distance { * @tparam OutT output data-type (for C and D matrices) * @tparam IdxT index data-type * @tparam Policy struct which tunes the Contraction kernel - * @tparam CoreLambda tells how to accumulate an x and y into + * @tparam CoreLambda tells how to accumulate an x and y into acc. its signature: template void core_lambda(AccT& acc, const DataT& x, const DataT& y) - * @tparam EpilogueLambda applies an elementwise function to compute final + * @tparam EpilogueLambda applies an elementwise function to compute final values. Its signature is: template void epilogue_lambda (AccT acc[][], DataT* regxn, DataT* regyn); @@ -57,13 +57,19 @@ namespace distance { * @param fin_op the final gemm epilogue lambda */ -template > +template > struct PairwiseDistances : public BaseClass { private: typedef Policy P; @@ -81,11 +87,21 @@ struct PairwiseDistances : public BaseClass { public: // Constructor - DI PairwiseDistances(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, - IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, - const DataT* _xn, const DataT* _yn, OutT* _dOutput, - char* _smem, CoreLambda _core_op, - EpilogueLambda _epilog_op, FinalLambda _fin_op, + DI PairwiseDistances(const DataT* _x, + const DataT* _y, + IdxT _m, + IdxT _n, + IdxT _k, + IdxT _lda, + IdxT _ldb, + IdxT _ldd, + const DataT* _xn, + const DataT* _yn, + OutT* _dOutput, + char* _smem, + CoreLambda _core_op, + EpilogueLambda _epilog_op, + FinalLambda _fin_op, rowEpilogueLambda _rowEpilog_op) : BaseClass(_x, _y, _m, _n, _k, _lda, _ldb, _ldd, _smem), xn(_xn), @@ -96,9 +112,12 @@ struct PairwiseDistances : public BaseClass { core_op(_core_op), epilog_op(_epilog_op), fin_op(_fin_op), - rowEpilog_op(_rowEpilog_op) {} + rowEpilog_op(_rowEpilog_op) + { + } - DI void run() { + DI void run() + { for (auto gridStrideY = blockIdx.y * P::Mblk; gridStrideY < this->m; gridStrideY += P::Mblk * gridDim.y) { for (auto gridStrideX = blockIdx.x * P::Nblk; gridStrideX < this->n; @@ -112,7 +131,8 @@ struct PairwiseDistances : public BaseClass { } private: - DI void updateIndicesY() { + DI void updateIndicesY() + { const auto stride = P::Nblk * gridDim.x; if (isRowMajor) { this->y += stride * this->ldb; @@ -122,21 +142,23 @@ struct PairwiseDistances : public BaseClass { this->yrowid += stride; } - DI void updateIndicesXY() { + DI void updateIndicesXY() + { const auto stride = P::Mblk * gridDim.y; if (isRowMajor) { this->x += stride * this->lda; this->yrowid = IdxT(blockIdx.x) * P::Nblk + this->srowid; - this->y = yBase + this->yrowid * this->ldb; + this->y = yBase + this->yrowid * this->ldb; } else { this->x += stride; this->yrowid = IdxT(blockIdx.x) * P::Nblk; - this->y = yBase + this->yrowid + this->srowid * this->ldb; + this->y = yBase + this->yrowid + this->srowid * this->ldb; } this->xrowid += stride; } - DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY) { + DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY) + { // Fetch next grid stride ldg if within range if ((gridStrideX + gridDim.x * P::Nblk) < this->n) { updateIndicesY(); @@ -147,10 +169,9 @@ struct PairwiseDistances : public BaseClass { } } - DI void prolog(IdxT gridStrideX, IdxT gridStrideY) { - if (gridStrideX == blockIdx.x * P::Nblk) { - this->ldgXY(0); - } + DI void prolog(IdxT gridStrideX, IdxT gridStrideY) + { + if (gridStrideX == blockIdx.x * P::Nblk) { this->ldgXY(0); } #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { @@ -165,7 +186,8 @@ struct PairwiseDistances : public BaseClass { this->pageWr ^= 1; } - DI void loop() { + DI void loop() + { for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { this->ldgXY(kidx); accumulate(); // on the previous k-block @@ -182,7 +204,8 @@ struct PairwiseDistances : public BaseClass { this->pageRd ^= 1; } - DI void accumulate() { + DI void accumulate() + { #pragma unroll for (int ki = 0; ki < P::Kblk; ki += P::Veclen) { this->ldsXY(ki); @@ -199,7 +222,8 @@ struct PairwiseDistances : public BaseClass { } } - DI void epilog(IdxT gridStrideX, IdxT gridStrideY) { + DI void epilog(IdxT gridStrideX, IdxT gridStrideY) + { if (useNorms) { DataT* sxNorm = (DataT*)(&smem[P::SmemSize]); DataT* syNorm = (&sxNorm[P::Mblk]); @@ -207,13 +231,13 @@ struct PairwiseDistances : public BaseClass { // Load x & y norms required by this threadblock in shmem buffer if (gridStrideX == blockIdx.x * P::Nblk) { for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) { - auto idx = gridStrideY + i; + auto idx = gridStrideY + i; sxNorm[i] = idx < this->m ? xn[idx] : 0; } } for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) { - auto idx = gridStrideX + i; + auto idx = gridStrideX + i; syNorm[i] = idx < this->n ? yn[idx] : 0; } @@ -288,42 +312,67 @@ struct PairwiseDistances : public BaseClass { * @param fin_op the final gemm epilogue lambda */ -template -__global__ __launch_bounds__( - Policy::Nthreads, - 2) void pairwiseDistanceMatKernel(const DataT* x, const DataT* y, - const DataT* _xn, const DataT* _yn, IdxT m, - IdxT n, IdxT k, IdxT lda, IdxT ldb, - IdxT ldd, OutT* dOutput, CoreLambda core_op, - EpilogueLambda epilog_op, - FinalLambda fin_op) { +template +__global__ __launch_bounds__(Policy::Nthreads, + 2) void pairwiseDistanceMatKernel(const DataT* x, + const DataT* y, + const DataT* _xn, + const DataT* _yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + CoreLambda core_op, + EpilogueLambda epilog_op, + FinalLambda fin_op) +{ extern __shared__ char smem[]; auto rowEpilog = [] __device__(IdxT starty) { return; }; - PairwiseDistances - obj(x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, - epilog_op, fin_op, rowEpilog); + PairwiseDistances + obj( + x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op, rowEpilog); obj.run(); } template -dim3 launchConfigGenerator(IdxT m, IdxT n, size_t sMemSize, T func) { - const auto numSMs = raft::getMultiProcessorCount(); +dim3 launchConfigGenerator(IdxT m, IdxT n, size_t sMemSize, T func) +{ + const auto numSMs = raft::getMultiProcessorCount(); int numBlocksPerSm = 0; dim3 grid; - CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocksPerSm, func, P::Nthreads, sMemSize)); + CUDA_CHECK( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, func, P::Nthreads, sMemSize)); int minGridSize = numSMs * numBlocksPerSm; - int yChunks = raft::ceildiv(m, P::Mblk); - int xChunks = raft::ceildiv(n, P::Nblk); - grid.y = yChunks > minGridSize ? minGridSize : yChunks; - grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks; + int yChunks = raft::ceildiv(m, P::Mblk); + int xChunks = raft::ceildiv(n, P::Nblk); + grid.y = yChunks > minGridSize ? minGridSize : yChunks; + grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks; if (grid.x != 1) { int i = 1; while (grid.y * i < minGridSize) { diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index c62f2e5f79..773b83ab13 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -31,14 +31,14 @@ class exception : public std::exception { explicit exception() noexcept : std::exception(), msg_() {} /** copy ctor */ - exception(exception const& src) noexcept - : std::exception(), msg_(src.what()) { + exception(exception const& src) noexcept : std::exception(), msg_(src.what()) + { collect_call_stack(); } /** ctor from an input message */ - explicit exception(std::string const msg) noexcept - : std::exception(), msg_(std::move(msg)) { + explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) + { collect_call_stack(); } @@ -51,7 +51,8 @@ class exception : public std::exception { /** append call stack info to this exception's message for ease of debug */ // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html - void collect_call_stack() noexcept { + void collect_call_stack() noexcept + { #ifdef __GNUC__ constexpr int kMaxStackDepth = 64; void* stack[kMaxStackDepth]; // NOLINT @@ -90,16 +91,16 @@ struct logic_error : public raft::exception { // FIXME: Need to be replaced with RAFT_FAIL /** macro to throw a runtime error */ -#define THROW(fmt, ...) \ - do { \ - std::string msg; \ - char errMsg[2048]; /* NOLINT */ \ - std::snprintf(errMsg, sizeof(errMsg), \ - "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ - msg += errMsg; \ - std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \ - msg += errMsg; \ - throw raft::exception(msg); \ +#define THROW(fmt, ...) \ + do { \ + std::string msg; \ + char errMsg[2048]; /* NOLINT */ \ + std::snprintf( \ + errMsg, sizeof(errMsg), "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ + msg += errMsg; \ + std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \ + msg += errMsg; \ + throw raft::exception(msg); \ } while (0) // FIXME: Need to be replaced with RAFT_EXPECTS @@ -109,16 +110,15 @@ struct logic_error : public raft::exception { if (!(check)) THROW(fmt, ##__VA_ARGS__); \ } while (0) -#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) \ - do { \ - char err_msg[2048]; /* NOLINT */ \ - std::snprintf(err_msg, sizeof(err_msg), location_prefix); \ - msg += err_msg; \ - std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, \ - __LINE__); \ - msg += err_msg; \ - std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__); \ - msg += err_msg; \ +#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) \ + do { \ + char err_msg[2048]; /* NOLINT */ \ + std::snprintf(err_msg, sizeof(err_msg), location_prefix); \ + msg += err_msg; \ + std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, __LINE__); \ + msg += err_msg; \ + std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__); \ + msg += err_msg; \ } while (0) /** diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index dbe7e83189..bb7d22e079 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -65,29 +65,29 @@ class handle_t { }()), streams_(n_streams), device_allocator_(std::make_shared()), - host_allocator_(std::make_shared()) { + host_allocator_(std::make_shared()) + { create_resources(); } /** - * @brief Construct a light handle copy from another + * @brief Construct a light handle copy from another * user stream, cuda handles, comms and worker pool are not copied - * The user_stream of the returned handle is set to the specified stream - * of the other handle worker pool - * @param[in] stream_id stream id in `other` worker streams + * The user_stream of the returned handle is set to the specified stream + * of the other handle worker pool + * @param[in] stream_id stream id in `other` worker streams * to be set as user stream in the constructed handle * @param[in] n_streams number worker streams to be created */ - handle_t(const handle_t& other, int stream_id, - int n_streams = kNumDefaultWorkerStreams) - : dev_id_(other.get_device()), streams_(n_streams) { - RAFT_EXPECTS( - other.get_num_internal_streams() > 0, - "ERROR: the main handle must have at least one worker stream\n"); - prop_ = other.get_device_properties(); + handle_t(const handle_t& other, int stream_id, int n_streams = kNumDefaultWorkerStreams) + : dev_id_(other.get_device()), streams_(n_streams) + { + RAFT_EXPECTS(other.get_num_internal_streams() > 0, + "ERROR: the main handle must have at least one worker stream\n"); + prop_ = other.get_device_properties(); device_prop_initialized_ = true; - device_allocator_ = other.get_device_allocator(); - host_allocator_ = other.get_host_allocator(); + device_allocator_ = other.get_device_allocator(); + host_allocator_ = other.get_host_allocator(); create_resources(); set_stream(other.get_internal_stream(stream_id)); } @@ -99,25 +99,22 @@ class handle_t { void set_stream(cudaStream_t stream) { user_stream_ = stream; } cudaStream_t get_stream() const { return user_stream_; } - rmm::cuda_stream_view get_stream_view() const { - return rmm::cuda_stream_view(user_stream_); - } + rmm::cuda_stream_view get_stream_view() const { return rmm::cuda_stream_view(user_stream_); } - void set_device_allocator(std::shared_ptr allocator) { + void set_device_allocator(std::shared_ptr allocator) + { device_allocator_ = allocator; } - std::shared_ptr get_device_allocator() const { - return device_allocator_; - } + std::shared_ptr get_device_allocator() const { return device_allocator_; } - void set_host_allocator(std::shared_ptr allocator) { + void set_host_allocator(std::shared_ptr allocator) + { host_allocator_ = allocator; } - std::shared_ptr get_host_allocator() const { - return host_allocator_; - } + std::shared_ptr get_host_allocator() const { return host_allocator_; } - cublasHandle_t get_cublas_handle() const { + cublasHandle_t get_cublas_handle() const + { std::lock_guard _(mutex_); if (!cublas_initialized_) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); @@ -126,7 +123,8 @@ class handle_t { return cublas_handle_; } - cusolverDnHandle_t get_cusolver_dn_handle() const { + cusolverDnHandle_t get_cusolver_dn_handle() const + { std::lock_guard _(mutex_); if (!cusolver_dn_initialized_) { CUSOLVER_CHECK(cusolverDnCreate(&cusolver_dn_handle_)); @@ -135,7 +133,8 @@ class handle_t { return cusolver_dn_handle_; } - cusolverSpHandle_t get_cusolver_sp_handle() const { + cusolverSpHandle_t get_cusolver_sp_handle() const + { std::lock_guard _(mutex_); if (!cusolver_sp_initialized_) { CUSOLVER_CHECK(cusolverSpCreate(&cusolver_sp_handle_)); @@ -144,7 +143,8 @@ class handle_t { return cusolver_sp_handle_; } - cusparseHandle_t get_cusparse_handle() const { + cusparseHandle_t get_cusparse_handle() const + { std::lock_guard _(mutex_); if (!cusparse_initialized_) { CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); @@ -154,16 +154,13 @@ class handle_t { } // legacy compatibility for cuML - cudaStream_t get_internal_stream(int sid) const { - return streams_.get_stream(sid).value(); - } + cudaStream_t get_internal_stream(int sid) const { return streams_.get_stream(sid).value(); } // new accessor return rmm::cuda_stream_view - rmm::cuda_stream_view get_internal_stream_view(int sid) const { - return streams_.get_stream(sid); - } + rmm::cuda_stream_view get_internal_stream_view(int sid) const { return streams_.get_stream(sid); } int get_num_internal_streams() const { return streams_.get_pool_size(); } - std::vector get_internal_streams() const { + std::vector get_internal_streams() const + { std::vector int_streams_vec; for (int i = 0; i < get_num_internal_streams(); i++) { int_streams_vec.push_back(get_internal_stream(i)); @@ -171,49 +168,51 @@ class handle_t { return int_streams_vec; } - void wait_on_user_stream() const { + void wait_on_user_stream() const + { CUDA_CHECK(cudaEventRecord(event_, user_stream_)); for (int i = 0; i < get_num_internal_streams(); i++) { CUDA_CHECK(cudaStreamWaitEvent(get_internal_stream(i), event_, 0)); } } - void wait_on_internal_streams() const { + void wait_on_internal_streams() const + { for (int i = 0; i < get_num_internal_streams(); i++) { CUDA_CHECK(cudaEventRecord(event_, get_internal_stream(i))); CUDA_CHECK(cudaStreamWaitEvent(user_stream_, event_, 0)); } } - void set_comms(std::shared_ptr communicator) { - communicator_ = communicator; - } + void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } - const comms::comms_t& get_comms() const { - RAFT_EXPECTS(this->comms_initialized(), - "ERROR: Communicator was not initialized\n"); + const comms::comms_t& get_comms() const + { + RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); return *communicator_; } - void set_subcomm(std::string key, std::shared_ptr subcomm) { + void set_subcomm(std::string key, std::shared_ptr subcomm) + { subcomms_[key] = subcomm; } - const comms::comms_t& get_subcomm(std::string key) const { - RAFT_EXPECTS(subcomms_.find(key) != subcomms_.end(), - "%s was not found in subcommunicators.", key.c_str()); + const comms::comms_t& get_subcomm(std::string key) const + { + RAFT_EXPECTS( + subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); auto subcomm = subcomms_.at(key); - RAFT_EXPECTS(nullptr != subcomm.get(), - "ERROR: Subcommunicator was not initialized"); + RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized"); return *subcomm; } bool comms_initialized() const { return (nullptr != communicator_.get()); } - const cudaDeviceProp& get_device_properties() const { + const cudaDeviceProp& get_device_properties() const + { std::lock_guard _(mutex_); if (!device_prop_initialized_) { CUDA_CHECK(cudaGetDeviceProperties(&prop_, dev_id_)); @@ -244,29 +243,28 @@ class handle_t { mutable bool device_prop_initialized_{false}; mutable std::mutex mutex_; - void create_resources() { - CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } + void create_resources() { CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); } - void destroy_resources() { + void destroy_resources() + { ///@todo: enable *_NO_THROW variants once we have enabled logging if (cusparse_initialized_) { - //CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_)); + // CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_)); CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_)); } if (cusolver_dn_initialized_) { - //CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); + // CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); CUSOLVER_CHECK(cusolverDnDestroy(cusolver_dn_handle_)); } if (cusolver_sp_initialized_) { - //CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); + // CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); CUSOLVER_CHECK(cusolverSpDestroy(cusolver_sp_handle_)); } if (cublas_initialized_) { - //CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_)); + // CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_)); CUBLAS_CHECK(cublasDestroy(cublas_handle_)); } - //CUDA_CHECK_NO_THROW(cudaEventDestroy(event_)); + // CUDA_CHECK_NO_THROW(cudaEventDestroy(event_)); CUDA_CHECK(cudaEventDestroy(event_)); } }; // class handle_t @@ -276,7 +274,8 @@ class handle_t { */ class stream_syncer { public: - explicit stream_syncer(const handle_t& handle) : handle_(handle) { + explicit stream_syncer(const handle_t& handle) : handle_(handle) + { handle_.wait_on_user_stream(); } ~stream_syncer() { handle_.wait_on_internal_streams(); } diff --git a/cpp/include/raft/integer_utils.h b/cpp/include/raft/integer_utils.h index a7cfb9287b..5fc56de14b 100644 --- a/cpp/include/raft/integer_utils.h +++ b/cpp/include/raft/integer_utils.h @@ -34,15 +34,13 @@ namespace raft { * `modulus` is positive. */ template -inline S round_up_safe(S number_to_round, S modulus) { +inline S round_up_safe(S number_to_round, S modulus) +{ auto remainder = number_to_round % modulus; - if (remainder == 0) { - return number_to_round; - } + if (remainder == 0) { return number_to_round; } auto rounded_up = number_to_round - remainder + modulus; if (rounded_up < number_to_round) { - throw std::invalid_argument( - "Attempt to round up beyond the type's maximum value"); + throw std::invalid_argument("Attempt to round up beyond the type's maximum value"); } return rounded_up; } @@ -53,8 +51,9 @@ inline S round_up_safe(S number_to_round, S modulus) { * `modulus` is positive. */ template -inline S round_down_safe(S number_to_round, S modulus) { - auto remainder = number_to_round % modulus; +inline S round_down_safe(S number_to_round, S modulus) +{ + auto remainder = number_to_round % modulus; auto rounded_down = number_to_round - remainder; return rounded_down; } @@ -72,25 +71,28 @@ inline S round_down_safe(S number_to_round, S modulus) { * the result will be incorrect */ template -constexpr inline S div_rounding_up_unsafe(const S& dividend, - const T& divisor) noexcept { +constexpr inline S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept +{ return (dividend + divisor - 1) / divisor; } namespace detail { template constexpr inline I div_rounding_up_safe(std::integral_constant, - I dividend, I divisor) noexcept { + I dividend, + I divisor) noexcept +{ // TODO: This could probably be implemented faster - return (dividend > divisor) - ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor) - : (dividend > 0); + return (dividend > divisor) ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor) + : (dividend > 0); } template constexpr inline I div_rounding_up_safe(std::integral_constant, - I dividend, I divisor) noexcept { - auto quotient = dividend / divisor; + I dividend, + I divisor) noexcept +{ + auto quotient = dividend / divisor; auto remainder = dividend % divisor; return quotient + (remainder != 0); } @@ -110,16 +112,17 @@ constexpr inline I div_rounding_up_safe(std::integral_constant, * approach of using (dividend + divisor - 1) / divisor */ template -constexpr inline std::enable_if_t::value, I> -div_rounding_up_safe(I dividend, I divisor) noexcept { - using i_is_a_signed_type = - std::integral_constant::value>; +constexpr inline std::enable_if_t::value, I> div_rounding_up_safe( + I dividend, I divisor) noexcept +{ + using i_is_a_signed_type = std::integral_constant::value>; return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor); } template -constexpr inline std::enable_if_t::value, bool> -is_a_power_of_two(I val) noexcept { +constexpr inline std::enable_if_t::value, bool> is_a_power_of_two( + I val) noexcept +{ return ((val - 1) & val) == 0; } @@ -147,14 +150,14 @@ is_a_power_of_two(I val) noexcept { * @return Absolute value if value type is signed. */ template -std::enable_if_t::value, T> constexpr inline absolute_value( - T value) { +std::enable_if_t::value, T> constexpr inline absolute_value(T value) +{ return std::abs(value); } // Unsigned type just returns itself. template -std::enable_if_t::value, T> constexpr inline absolute_value( - T value) { +std::enable_if_t::value, T> constexpr inline absolute_value(T value) +{ return value; } diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh index 0da7da2eb6..0bbfa2bb3c 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/classlabels.cuh @@ -43,33 +43,35 @@ namespace label { * \param [in] allocator device allocator */ template -void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique, +void getUniquelabels(value_t* y, + size_t n, + value_t** y_unique, + int* n_unique, cudaStream_t stream, - std::shared_ptr allocator) { + std::shared_ptr allocator) +{ raft::mr::device::buffer y2(allocator, stream, n); raft::mr::device::buffer y3(allocator, stream, n); raft::mr::device::buffer d_num_selected(allocator, stream, 1); - size_t bytes = 0; + size_t bytes = 0; size_t bytes2 = 0; // Query how much temporary storage we will need for cub operations // and allocate it cub::DeviceRadixSort::SortKeys(NULL, bytes, y, y2.data(), n); - cub::DeviceSelect::Unique(NULL, bytes2, y2.data(), y3.data(), - d_num_selected.data(), n); + cub::DeviceSelect::Unique(NULL, bytes2, y2.data(), y3.data(), d_num_selected.data(), n); bytes = max(bytes, bytes2); raft::mr::device::buffer cub_storage(allocator, stream, bytes); // Select Unique classes cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, y2.data(), n); - cub::DeviceSelect::Unique(cub_storage.data(), bytes, y2.data(), y3.data(), - d_num_selected.data(), n); + cub::DeviceSelect::Unique( + cub_storage.data(), bytes, y2.data(), y3.data(), d_num_selected.data(), n); raft::update_host(n_unique, d_num_selected.data(), 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); // Copy unique classes to output - *y_unique = - (value_t *)allocator->allocate(*n_unique * sizeof(value_t), stream); + *y_unique = (value_t*)allocator->allocate(*n_unique * sizeof(value_t), stream); raft::copy(*y_unique, y3.data(), *n_unique, stream); } @@ -92,16 +94,17 @@ void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique, * \param [in] stream cuda stream */ template -void getOvrlabels(value_t *y, int n, value_t *y_unique, int n_classes, - value_t *y_out, int idx, cudaStream_t stream) { +void getOvrlabels( + value_t* y, int n, value_t* y_unique, int n_classes, value_t* y_out, int idx, cudaStream_t stream) +{ ASSERT(idx < n_classes, "Parameter idx should not be larger than the number " "of classes"); raft::linalg::unaryOp( - y_out, y, n, - [idx, y_unique] __device__(value_t y) { - return y == y_unique[idx] ? +1 : -1; - }, + y_out, + y, + n, + [idx, y_unique] __device__(value_t y) { return y == y_unique[idx] ? +1 : -1; }, stream); CUDA_CHECK(cudaPeekAtLastError()); } @@ -110,9 +113,14 @@ void getOvrlabels(value_t *y, int n, value_t *y_unique, int n_classes, // +/-1, return array with the new class labels and corresponding indices. template -__global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in, - Type *out, size_t N, Lambda filter_op, - bool zero_based = false) { +__global__ void map_label_kernel(Type* map_ids, + size_t N_labels, + Type* in, + Type* out, + size_t N, + Lambda filter_op, + bool zero_based = false) +{ int tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { if (!filter_op(in[tid])) { @@ -127,68 +135,75 @@ __global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in, } /** - * Maps an input array containing a series of numbers into a new array - * where numbers have been mapped to a monotonically increasing set - * of labels. This can be useful in machine learning algorithms, for instance, - * where a given set of labels is not taken from a monotonically increasing - * set. This can happen if they are filtered or if only a subset of the - * total labels are used in a dataset. This is also useful in graph algorithms - * where a set of vertices need to be labeled in a monotonically increasing - * order. - * @tparam Type the numeric type of the input and output arrays - * @tparam Lambda the type of an optional filter function, which determines - * which items in the array to map. - * @param out the output monotonic array - * @param in input label array - * @param N number of elements in the input array - * @param stream cuda stream to use - * @param filter_op an optional function for specifying which values - * should have monotonically increasing labels applied to them. - */ + * Maps an input array containing a series of numbers into a new array + * where numbers have been mapped to a monotonically increasing set + * of labels. This can be useful in machine learning algorithms, for instance, + * where a given set of labels is not taken from a monotonically increasing + * set. This can happen if they are filtered or if only a subset of the + * total labels are used in a dataset. This is also useful in graph algorithms + * where a set of vertices need to be labeled in a monotonically increasing + * order. + * @tparam Type the numeric type of the input and output arrays + * @tparam Lambda the type of an optional filter function, which determines + * which items in the array to map. + * @param out the output monotonic array + * @param in input label array + * @param N number of elements in the input array + * @param stream cuda stream to use + * @param filter_op an optional function for specifying which values + * should have monotonically increasing labels applied to them. + */ template -void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, +void make_monotonic(Type* out, + Type* in, + size_t N, + cudaStream_t stream, Lambda filter_op, std::shared_ptr allocator, - bool zero_based = false) { + bool zero_based = false) +{ static const size_t TPB_X = 256; dim3 blocks(raft::ceildiv(N, TPB_X)); dim3 threads(TPB_X); - Type *map_ids; + Type* map_ids; int num_clusters; getUniquelabels(in, N, &map_ids, &num_clusters, stream, allocator); - map_label_kernel<<>>( - map_ids, num_clusters, in, out, N, filter_op, zero_based); + map_label_kernel + <<>>(map_ids, num_clusters, in, out, N, filter_op, zero_based); allocator->deallocate(map_ids, num_clusters * sizeof(Type), stream); } /** - * Maps an input array containing a series of numbers into a new array - * where numbers have been mapped to a monotonically increasing set - * of labels. This can be useful in machine learning algorithms, for instance, - * where a given set of labels is not taken from a monotonically increasing - * set. This can happen if they are filtered or if only a subset of the - * total labels are used in a dataset. This is also useful in graph algorithms - * where a set of vertices need to be labeled in a monotonically increasing - * order. - * @tparam Type the numeric type of the input and output arrays - * @tparam Lambda the type of an optional filter function, which determines - * which items in the array to map. - * @param out output label array with labels assigned monotonically - * @param in input label array - * @param N number of elements in the input array - * @param stream cuda stream to use - */ + * Maps an input array containing a series of numbers into a new array + * where numbers have been mapped to a monotonically increasing set + * of labels. This can be useful in machine learning algorithms, for instance, + * where a given set of labels is not taken from a monotonically increasing + * set. This can happen if they are filtered or if only a subset of the + * total labels are used in a dataset. This is also useful in graph algorithms + * where a set of vertices need to be labeled in a monotonically increasing + * order. + * @tparam Type the numeric type of the input and output arrays + * @tparam Lambda the type of an optional filter function, which determines + * which items in the array to map. + * @param out output label array with labels assigned monotonically + * @param in input label array + * @param N number of elements in the input array + * @param stream cuda stream to use + */ template -void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, +void make_monotonic(Type* out, + Type* in, + size_t N, + cudaStream_t stream, std::shared_ptr allocator, - bool zero_based = false) { + bool zero_based = false) +{ make_monotonic( - out, in, N, stream, [] __device__(Type val) { return false; }, allocator, - zero_based); + out, in, N, stream, [] __device__(Type val) { return false; }, allocator, zero_based); } }; // namespace label }; // end namespace raft diff --git a/cpp/include/raft/label/merge_labels.cuh b/cpp/include/raft/label/merge_labels.cuh index bed74581a2..1ee0659b0d 100644 --- a/cpp/include/raft/label/merge_labels.cuh +++ b/cpp/include/raft/label/merge_labels.cuh @@ -35,8 +35,10 @@ __global__ void __launch_bounds__(TPB_X) propagate_label_kernel(const value_idx* __restrict__ labels_a, const value_idx* __restrict__ labels_b, value_idx* __restrict__ R, - const bool* __restrict__ mask, bool* __restrict__ m, - value_idx N) { + const bool* __restrict__ mask, + bool* __restrict__ m, + value_idx N) +{ value_idx tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { if (__ldg((char*)mask + tid)) { @@ -65,15 +67,17 @@ template __global__ void __launch_bounds__(TPB_X) reassign_label_kernel(value_idx* __restrict__ labels_a, const value_idx* __restrict__ labels_b, - const value_idx* __restrict__ R, value_idx N, - value_idx MAX_LABEL) { + const value_idx* __restrict__ R, + value_idx N, + value_idx MAX_LABEL) +{ value_idx tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { // Note: labels are from 1 to N - value_idx la = labels_a[tid]; - value_idx lb = __ldg(labels_b + tid); - value_idx ra = (la == MAX_LABEL) ? MAX_LABEL : __ldg(R + (la - 1)) + 1; - value_idx rb = (lb == MAX_LABEL) ? MAX_LABEL : __ldg(R + (lb - 1)) + 1; + value_idx la = labels_a[tid]; + value_idx lb = __ldg(labels_b + tid); + value_idx ra = (la == MAX_LABEL) ? MAX_LABEL : __ldg(R + (la - 1)) + 1; + value_idx rb = (lb == MAX_LABEL) ? MAX_LABEL : __ldg(R + (lb - 1)) + 1; labels_a[tid] = min(ra, rb); } } @@ -108,9 +112,14 @@ __global__ void __launch_bounds__(TPB_X) * @param[in] stream CUDA stream */ template -void merge_labels(value_idx* labels_a, const value_idx* labels_b, - const bool* mask, value_idx* R, bool* m, value_idx N, - cudaStream_t stream) { +void merge_labels(value_idx* labels_a, + const value_idx* labels_b, + const bool* mask, + value_idx* R, + bool* m, + value_idx N, + cudaStream_t stream) +{ dim3 blocks(raft::ceildiv(N, value_idx(TPB_X))); dim3 threads(TPB_X); value_idx MAX_LABEL = std::numeric_limits::max(); diff --git a/cpp/include/raft/lap/d_structs.h b/cpp/include/raft/lap/d_structs.h index ed545b7198..e488dc528f 100644 --- a/cpp/include/raft/lap/d_structs.h +++ b/cpp/include/raft/lap/d_structs.h @@ -26,18 +26,18 @@ template struct Vertices { - vertex_t *row_assignments; - vertex_t *col_assignments; - int *row_covers; - int *col_covers; - weight_t *row_duals; - weight_t *col_duals; - weight_t *col_slacks; + vertex_t* row_assignments; + vertex_t* col_assignments; + int* row_covers; + int* col_covers; + weight_t* row_duals; + weight_t* col_duals; + weight_t* col_slacks; }; template struct VertexData { - vertex_t *parents; - vertex_t *children; - int *is_visited; + vertex_t* parents; + vertex_t* children; + int* is_visited; }; diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh index 6bc1c08029..64b6a31efb 100644 --- a/cpp/include/raft/lap/lap.cuh +++ b/cpp/include/raft/lap/lap.cuh @@ -38,12 +38,12 @@ class LinearAssignmentProblem { vertex_t batchsize_; weight_t epsilon_; - weight_t const *d_costs_; + weight_t const* d_costs_; Vertices d_vertices_dev; VertexData d_row_data_dev, d_col_data_dev; - raft::handle_t const &handle_; + raft::handle_t const& handle_; raft::mr::device::buffer row_covers_v; raft::mr::device::buffer col_covers_v; raft::mr::device::buffer row_duals_v; @@ -59,8 +59,10 @@ class LinearAssignmentProblem { raft::mr::device::buffer obj_val_dual_v; public: - LinearAssignmentProblem(raft::handle_t const &handle, vertex_t size, - vertex_t batchsize, weight_t epsilon) + LinearAssignmentProblem(raft::handle_t const& handle, + vertex_t size, + vertex_t batchsize, + weight_t epsilon) : handle_(handle), size_(size), batchsize_(batchsize), @@ -78,11 +80,13 @@ class LinearAssignmentProblem { row_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0), col_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0), obj_val_primal_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - obj_val_dual_v(handle_.get_device_allocator(), handle_.get_stream(), 0) {} + obj_val_dual_v(handle_.get_device_allocator(), handle_.get_stream(), 0) + { + } // Executes Hungarian algorithm on the input cost matrix. - void solve(weight_t const *d_cost_matrix, vertex_t *d_row_assignment, - vertex_t *d_col_assignment) { + void solve(weight_t const* d_cost_matrix, vertex_t* d_row_assignment, vertex_t* d_col_assignment) + { initializeDevice(); d_vertices_dev.row_assignments = d_row_assignment; @@ -94,27 +98,13 @@ class LinearAssignmentProblem { while (step != 100) { switch (step) { - case 0: - step = hungarianStep0(); - break; - case 1: - step = hungarianStep1(); - break; - case 2: - step = hungarianStep2(); - break; - case 3: - step = hungarianStep3(); - break; - case 4: - step = hungarianStep4(); - break; - case 5: - step = hungarianStep5(); - break; - case 6: - step = hungarianStep6(); - break; + case 0: step = hungarianStep0(); break; + case 1: step = hungarianStep1(); break; + case 2: step = hungarianStep2(); break; + case 3: step = hungarianStep3(); break; + case 4: step = hungarianStep4(); break; + case 5: step = hungarianStep5(); break; + case 6: step = hungarianStep6(); break; } } @@ -122,36 +112,39 @@ class LinearAssignmentProblem { } // Function for getting optimal row dual vector for subproblem spId. - std::pair getRowDualVector(int spId) const { + std::pair getRowDualVector(int spId) const + { return std::make_pair(row_duals_v.data() + spId * size_, size_); } // Function for getting optimal col dual vector for subproblem spId. - std::pair getColDualVector(int spId) { + std::pair getColDualVector(int spId) + { return std::make_pair(col_duals_v.data() + spId * size_, size_); } // Function for getting optimal primal objective value for subproblem spId. - weight_t getPrimalObjectiveValue(int spId) { + weight_t getPrimalObjectiveValue(int spId) + { weight_t result; - raft::update_host(&result, obj_val_primal_v.data() + spId, 1, - handle_.get_stream()); + raft::update_host(&result, obj_val_primal_v.data() + spId, 1, handle_.get_stream()); CHECK_CUDA(handle_.get_stream()); return result; } // Function for getting optimal dual objective value for subproblem spId. - weight_t getDualObjectiveValue(int spId) { + weight_t getDualObjectiveValue(int spId) + { weight_t result; - raft::update_host(&result, obj_val_dual_v.data() + spId, 1, - handle_.get_stream()); + raft::update_host(&result, obj_val_dual_v.data() + spId, 1, handle_.get_stream()); CHECK_CUDA(handle_.get_stream()); return result; } private: // Helper function for initializing global variables and arrays on a single host. - void initializeDevice() { + void initializeDevice() + { row_covers_v.resize(batchsize_ * size_); col_covers_v.resize(batchsize_ * size_); row_duals_v.resize(batchsize_ * size_); @@ -169,39 +162,36 @@ class LinearAssignmentProblem { d_vertices_dev.row_covers = row_covers_v.data(); d_vertices_dev.col_covers = col_covers_v.data(); - d_vertices_dev.row_duals = row_duals_v.data(); - d_vertices_dev.col_duals = col_duals_v.data(); + d_vertices_dev.row_duals = row_duals_v.data(); + d_vertices_dev.col_duals = col_duals_v.data(); d_vertices_dev.col_slacks = col_slacks_v.data(); d_row_data_dev.is_visited = row_is_visited_v.data(); d_col_data_dev.is_visited = col_is_visited_v.data(); - d_row_data_dev.parents = row_parents_v.data(); - d_row_data_dev.children = row_children_v.data(); - d_col_data_dev.parents = col_parents_v.data(); - d_col_data_dev.children = col_children_v.data(); - - thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(), - int{0}); - thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(), - int{0}); - thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(), - weight_t{0}); - thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(), - weight_t{0}); + d_row_data_dev.parents = row_parents_v.data(); + d_row_data_dev.children = row_children_v.data(); + d_col_data_dev.parents = col_parents_v.data(); + d_col_data_dev.children = col_children_v.data(); + + thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(), int{0}); + thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(), int{0}); + thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(), weight_t{0}); + thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(), weight_t{0}); } // Function for calculating initial zeros by subtracting row and column minima from each element. - int hungarianStep0() { - detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_, - size_); + int hungarianStep0() + { + detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_, size_); return 1; } // Function for calculating initial zeros by subtracting row and column minima from each element. - int hungarianStep1() { - detail::computeInitialAssignments(handle_, d_costs_, d_vertices_dev, - batchsize_, size_, epsilon_); + int hungarianStep1() + { + detail::computeInitialAssignments( + handle_, d_costs_, d_vertices_dev, batchsize_, size_, epsilon_); int next = 2; @@ -217,10 +207,10 @@ class LinearAssignmentProblem { } // Function for checking optimality and constructing predicates and covers. - int hungarianStep2() { - int cover_count = - detail::computeRowCovers(handle_, d_vertices_dev, d_row_data_dev, - d_col_data_dev, batchsize_, size_); + int hungarianStep2() + { + int cover_count = detail::computeRowCovers( + handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_); int next = (cover_count == batchsize_ * size_) ? 6 : 3; @@ -228,17 +218,23 @@ class LinearAssignmentProblem { } // Function for building alternating tree rooted at unassigned rows. - int hungarianStep3() { + int hungarianStep3() + { int next; - raft::mr::device::buffer flag_v(handle_.get_device_allocator(), - handle_.get_stream(), 1); + raft::mr::device::buffer flag_v(handle_.get_device_allocator(), handle_.get_stream(), 1); bool h_flag = false; raft::update_device(flag_v.data(), &h_flag, 1, handle_.get_stream()); - detail::executeZeroCover(handle_, d_costs_, d_vertices_dev, d_row_data_dev, - d_col_data_dev, flag_v.data(), batchsize_, size_, + detail::executeZeroCover(handle_, + d_costs_, + d_vertices_dev, + d_row_data_dev, + d_col_data_dev, + flag_v.data(), + batchsize_, + size_, epsilon_); raft::update_host(&h_flag, flag_v.data(), 1, handle_.get_stream()); @@ -249,31 +245,36 @@ class LinearAssignmentProblem { } // Function for augmenting the solution along multiple node-disjoint alternating trees. - int hungarianStep4() { - detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_, - size_); + int hungarianStep4() + { + detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_, size_); - detail::augmentationPass(handle_, d_vertices_dev, d_row_data_dev, - d_col_data_dev, batchsize_, size_); + detail::augmentationPass( + handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_); return 2; } // Function for updating dual solution to introduce new zero-cost arcs. - int hungarianStep5() { - detail::dualUpdate(handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, - batchsize_, size_, epsilon_); + int hungarianStep5() + { + detail::dualUpdate( + handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_, epsilon_); return 3; } // Function for calculating primal and dual objective values at optimality. - int hungarianStep6() { - detail::calcObjValPrimal(handle_, obj_val_primal_v.data(), d_costs_, - d_vertices_dev.row_assignments, batchsize_, size_); + int hungarianStep6() + { + detail::calcObjValPrimal(handle_, + obj_val_primal_v.data(), + d_costs_, + d_vertices_dev.row_assignments, + batchsize_, + size_); - detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev, - batchsize_, size_); + detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev, batchsize_, size_); return 100; } diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/lap_functions.cuh index 0079f50e82..9bbd44bf09 100644 --- a/cpp/include/raft/lap/lap_functions.cuh +++ b/cpp/include/raft/lap/lap_functions.cuh @@ -46,20 +46,26 @@ const int BLOCKDIMX{64}; const int BLOCKDIMY{1}; // Function for calculating grid and block dimensions from the given input size. -inline void calculateLinearDims(dim3 &blocks_per_grid, dim3 &threads_per_block, - int &total_blocks, int size) { +inline void calculateLinearDims(dim3& blocks_per_grid, + dim3& threads_per_block, + int& total_blocks, + int size) +{ threads_per_block.x = BLOCKDIMX * BLOCKDIMY; int value = size / threads_per_block.x; if (size % threads_per_block.x > 0) value++; - total_blocks = value; + total_blocks = value; blocks_per_grid.x = value; } // Function for calculating grid and block dimensions from the given input size for square grid. -inline void calculateSquareDims(dim3 &blocks_per_grid, dim3 &threads_per_block, - int &total_blocks, int size) { +inline void calculateSquareDims(dim3& blocks_per_grid, + dim3& threads_per_block, + int& total_blocks, + int size) +{ threads_per_block.x = BLOCKDIMX; threads_per_block.y = BLOCKDIMY; @@ -68,15 +74,16 @@ inline void calculateSquareDims(dim3 &blocks_per_grid, dim3 &threads_per_block, int valuex = (int)ceil((float)(sq_size) / BLOCKDIMX); int valuey = (int)ceil((float)(sq_size) / BLOCKDIMY); - total_blocks = valuex * valuey; + total_blocks = valuex * valuey; blocks_per_grid.x = valuex; blocks_per_grid.y = valuey; } -// Function for calculating grid and block dimensions from the given input size for rectangular grid. -inline void calculateRectangularDims(dim3 &blocks_per_grid, - dim3 &threads_per_block, int &total_blocks, - int xsize, int ysize) { +// Function for calculating grid and block dimensions from the given input size for rectangular +// grid. +inline void calculateRectangularDims( + dim3& blocks_per_grid, dim3& threads_per_block, int& total_blocks, int xsize, int ysize) +{ threads_per_block.x = BLOCKDIMX; threads_per_block.y = BLOCKDIMY; @@ -86,16 +93,18 @@ inline void calculateRectangularDims(dim3 &blocks_per_grid, int valuey = ysize / threads_per_block.y; if (ysize % threads_per_block.y > 0) valuey++; - total_blocks = valuex * valuey; + total_blocks = valuex * valuey; blocks_per_grid.x = valuex; blocks_per_grid.y = valuey; } template -inline void initialReduction(raft::handle_t const &handle, - weight_t const *d_costs, - Vertices &d_vertices_dev, - int SP, vertex_t N) { +inline void initialReduction(raft::handle_t const& handle, + weight_t const* d_costs, + Vertices& d_vertices_dev, + int SP, + vertex_t N) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; @@ -103,34 +112,38 @@ inline void initialReduction(raft::handle_t const &handle, raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_rowReduction<<>>( - d_costs, d_vertices_dev.row_duals, SP, N, - std::numeric_limits::max()); + kernel_rowReduction<<>>( + d_costs, d_vertices_dev.row_duals, SP, N, std::numeric_limits::max()); CHECK_CUDA(handle.get_stream()); - kernel_columnReduction<<>>( - d_costs, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N, + kernel_columnReduction<<>>( + d_costs, + d_vertices_dev.row_duals, + d_vertices_dev.col_duals, + SP, + N, std::numeric_limits::max()); CHECK_CUDA(handle.get_stream()); } template -inline void computeInitialAssignments(raft::handle_t const &handle, - weight_t const *d_costs, - Vertices &d_vertices, - int SP, vertex_t N, weight_t epsilon) { +inline void computeInitialAssignments(raft::handle_t const& handle, + weight_t const* d_costs, + Vertices& d_vertices, + int SP, + vertex_t N, + weight_t epsilon) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; std::size_t size = SP * N; - raft::mr::device::buffer row_lock_v(handle.get_device_allocator(), - handle.get_stream(), size); - raft::mr::device::buffer col_lock_v(handle.get_device_allocator(), - handle.get_stream(), size); + raft::mr::device::buffer row_lock_v( + handle.get_device_allocator(), handle.get_stream(), size); + raft::mr::device::buffer col_lock_v( + handle.get_device_allocator(), handle.get_stream(), size); thrust::fill_n(thrust::device, d_vertices.row_assignments, size, -1); thrust::fill_n(thrust::device, d_vertices.col_assignments, size, -1); @@ -140,21 +153,29 @@ inline void computeInitialAssignments(raft::handle_t const &handle, raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_computeInitialAssignments<<>>( - d_costs, d_vertices.row_duals, d_vertices.col_duals, - d_vertices.row_assignments, d_vertices.col_assignments, row_lock_v.data(), - col_lock_v.data(), SP, N, epsilon); + kernel_computeInitialAssignments<<>>( + d_costs, + d_vertices.row_duals, + d_vertices.col_duals, + d_vertices.row_assignments, + d_vertices.col_assignments, + row_lock_v.data(), + col_lock_v.data(), + SP, + N, + epsilon); CHECK_CUDA(handle.get_stream()); } // Function for finding row cover on individual devices. template -inline int computeRowCovers(raft::handle_t const &handle, - Vertices &d_vertices, - VertexData &d_row_data, - VertexData &d_col_data, int SP, - vertex_t N) { +inline int computeRowCovers(raft::handle_t const& handle, + Vertices& d_vertices, + VertexData& d_row_data, + VertexData& d_col_data, + int SP, + vertex_t N) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; @@ -163,8 +184,7 @@ inline int computeRowCovers(raft::handle_t const &handle, thrust::fill_n(thrust::device, d_vertices.row_covers, size, int{0}); thrust::fill_n(thrust::device, d_vertices.col_covers, size, int{0}); - thrust::fill_n(thrust::device, d_vertices.col_slacks, size, - std::numeric_limits::max()); + thrust::fill_n(thrust::device, d_vertices.col_slacks, size, std::numeric_limits::max()); thrust::fill_n(thrust::device, d_row_data.is_visited, size, DORMANT); thrust::fill_n(thrust::device, d_col_data.is_visited, size, DORMANT); thrust::fill_n(thrust::device, d_row_data.parents, size, vertex_t{-1}); @@ -174,25 +194,28 @@ inline int computeRowCovers(raft::handle_t const &handle, raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_computeRowCovers<<>>( - d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited, - SP, N); + kernel_computeRowCovers<<>>( + d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited, SP, N); CHECK_CUDA(handle.get_stream()); - return thrust::reduce(thrust::device, d_vertices.row_covers, - d_vertices.row_covers + size); + return thrust::reduce(thrust::device, d_vertices.row_covers, d_vertices.row_covers + size); } // Function for covering the zeros in uncovered rows and expanding the frontier. template -inline void coverZeroAndExpand( - raft::handle_t const &handle, weight_t const *d_costs_dev, - vertex_t const *d_rows_csr_neighbors, vertex_t const *d_rows_csr_ptrs, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, VertexData &d_col_data_dev, - bool *d_flag, int SP, vertex_t N, weight_t epsilon) { +inline void coverZeroAndExpand(raft::handle_t const& handle, + weight_t const* d_costs_dev, + vertex_t const* d_rows_csr_neighbors, + vertex_t const* d_rows_csr_ptrs, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + bool* d_flag, + int SP, + vertex_t N, + weight_t epsilon) +{ int total_blocks = 0; dim3 blocks_per_grid; dim3 threads_per_block; @@ -200,24 +223,34 @@ inline void coverZeroAndExpand( raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_coverAndExpand<<>>( - d_flag, d_rows_csr_ptrs, d_rows_csr_neighbors, d_costs_dev, d_vertices_dev, - d_row_data_dev, d_col_data_dev, SP, N, epsilon); + kernel_coverAndExpand<<>>( + d_flag, + d_rows_csr_ptrs, + d_rows_csr_neighbors, + d_costs_dev, + d_vertices_dev, + d_row_data_dev, + d_col_data_dev, + SP, + N, + epsilon); } template -inline vertex_t zeroCoverIteration(raft::handle_t const &handle, - weight_t const *d_costs_dev, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, - bool *d_flag, int SP, vertex_t N, - weight_t epsilon) { +inline vertex_t zeroCoverIteration(raft::handle_t const& handle, + weight_t const* d_costs_dev, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + bool* d_flag, + int SP, + vertex_t N, + weight_t epsilon) +{ vertex_t M; - raft::mr::device::buffer csr_ptrs_v(handle.get_device_allocator(), - handle.get_stream(), 0); + raft::mr::device::buffer csr_ptrs_v( + handle.get_device_allocator(), handle.get_stream(), 0); raft::mr::device::buffer csr_neighbors_v( handle.get_device_allocator(), handle.get_stream(), 0); @@ -226,8 +259,8 @@ inline vertex_t zeroCoverIteration(raft::handle_t const &handle, dim3 threads_per_block; int total_blocks = 0; - raft::mr::device::buffer predicates_v(handle.get_device_allocator(), - handle.get_stream(), SP * N); + raft::mr::device::buffer predicates_v( + handle.get_device_allocator(), handle.get_stream(), SP * N); raft::mr::device::buffer addresses_v( handle.get_device_allocator(), handle.get_stream(), SP * N); @@ -242,87 +275,108 @@ inline vertex_t zeroCoverIteration(raft::handle_t const &handle, blocks_per_grid, threads_per_block, total_blocks, N, SP); // construct predicate matrix for edges. - kernel_rowPredicateConstructionCSR<<>>( - predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP, - N); + predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP, N); CHECK_CUDA(handle.get_stream()); M = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); - thrust::exclusive_scan(thrust::device, addresses_v.begin(), - addresses_v.end(), addresses_v.begin()); + thrust::exclusive_scan( + thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin()); if (M > 0) { csr_neighbors_v.resize(M); - kernel_rowScatterCSR<<>>( - predicates_v.data(), addresses_v.data(), csr_neighbors_v.data(), - csr_ptrs_v.data(), M, SP, N); + kernel_rowScatterCSR<<>>( + predicates_v.data(), + addresses_v.data(), + csr_neighbors_v.data(), + csr_ptrs_v.data(), + M, + SP, + N); CHECK_CUDA(handle.get_stream()); } } if (M > 0) { - coverZeroAndExpand(handle, d_costs_dev, csr_neighbors_v.data(), - csr_ptrs_v.data(), d_vertices_dev, d_row_data_dev, - d_col_data_dev, d_flag, SP, N, epsilon); + coverZeroAndExpand(handle, + d_costs_dev, + csr_neighbors_v.data(), + csr_ptrs_v.data(), + d_vertices_dev, + d_row_data_dev, + d_col_data_dev, + d_flag, + SP, + N, + epsilon); } return M; } -// Function for executing recursive zero cover. Returns the next step (Step 4 or Step 5) depending on the presence of uncovered zeros. +// Function for executing recursive zero cover. Returns the next step (Step 4 or Step 5) depending +// on the presence of uncovered zeros. template -inline void executeZeroCover(raft::handle_t const &handle, - weight_t const *d_costs_dev, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, bool *d_flag, - int SP, vertex_t N, weight_t epsilon) { +inline void executeZeroCover(raft::handle_t const& handle, + weight_t const* d_costs_dev, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + bool* d_flag, + int SP, + vertex_t N, + weight_t epsilon) +{ vertex_t M = 1; while (M > 0) { - M = zeroCoverIteration(handle, d_costs_dev, d_vertices_dev, d_row_data_dev, - d_col_data_dev, d_flag, SP, N, epsilon); + M = zeroCoverIteration( + handle, d_costs_dev, d_vertices_dev, d_row_data_dev, d_col_data_dev, d_flag, SP, N, epsilon); } } // Function for executing reverse pass of the maximum matching. template -inline void reversePass(raft::handle_t const &handle, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, int SP, int N) { +inline void reversePass(raft::handle_t const& handle, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + int SP, + int N) +{ int total_blocks = 0; dim3 blocks_per_grid; dim3 threads_per_block; std::size_t size = SP * N; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, size); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, size); - raft::mr::device::buffer predicates_v(handle.get_device_allocator(), - handle.get_stream(), size); - raft::mr::device::buffer addresses_v(handle.get_device_allocator(), - handle.get_stream(), size); + raft::mr::device::buffer predicates_v( + handle.get_device_allocator(), handle.get_stream(), size); + raft::mr::device::buffer addresses_v( + handle.get_device_allocator(), handle.get_stream(), size); thrust::fill_n(thrust::device, predicates_v.data(), size, false); thrust::fill_n(thrust::device, addresses_v.data(), size, vertex_t{0}); // compact the reverse pass row vertices. - kernel_augmentPredicateConstruction<<>>( predicates_v.data(), addresses_v.data(), d_col_data_dev.is_visited, size); CHECK_CUDA(handle.get_stream()); // calculate total number of vertices. - std::size_t csr_size = - thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); + std::size_t csr_size = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); // exclusive scan for calculating the scatter addresses. - thrust::exclusive_scan(thrust::device, addresses_v.begin(), addresses_v.end(), - addresses_v.begin()); + thrust::exclusive_scan( + thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin()); if (csr_size > 0) { int total_blocks_1 = 0; @@ -334,14 +388,12 @@ inline void reversePass(raft::handle_t const &handle, raft::mr::device::buffer elements_v( handle.get_device_allocator(), handle.get_stream(), csr_size); - kernel_augmentScatter<<>>( + kernel_augmentScatter<<>>( elements_v.data(), predicates_v.data(), addresses_v.data(), size); CHECK_CUDA(handle.get_stream()); - kernel_reverseTraversal<<>>( + kernel_reverseTraversal<<>>( elements_v.data(), d_row_data_dev, d_col_data_dev, csr_size); CHECK_CUDA(handle.get_stream()); } @@ -349,27 +401,30 @@ inline void reversePass(raft::handle_t const &handle, // Function for executing augmentation pass of the maximum matching. template -inline void augmentationPass(raft::handle_t const &handle, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, int SP, - int N) { +inline void augmentationPass(raft::handle_t const& handle, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + int SP, + int N) +{ int total_blocks = 0; dim3 blocks_per_grid; dim3 threads_per_block; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, SP * N); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP * N); - raft::mr::device::buffer predicates_v(handle.get_device_allocator(), - handle.get_stream(), SP * N); - raft::mr::device::buffer addresses_v(handle.get_device_allocator(), - handle.get_stream(), SP * N); + raft::mr::device::buffer predicates_v( + handle.get_device_allocator(), handle.get_stream(), SP * N); + raft::mr::device::buffer addresses_v( + handle.get_device_allocator(), handle.get_stream(), SP * N); thrust::fill_n(thrust::device, predicates_v.data(), SP * N, false); thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0}); // compact the reverse pass row vertices. - kernel_augmentPredicateConstruction<<>>( predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP * N); @@ -380,8 +435,8 @@ inline void augmentationPass(raft::handle_t const &handle, vertex_t row_ids_csr_size = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); // exclusive scan for calculating the scatter addresses. - thrust::exclusive_scan(thrust::device, addresses_v.begin(), addresses_v.end(), - addresses_v.begin()); + thrust::exclusive_scan( + thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin()); if (row_ids_csr_size > 0) { int total_blocks_1 = 0; @@ -393,17 +448,18 @@ inline void augmentationPass(raft::handle_t const &handle, raft::mr::device::buffer elements_v( handle.get_device_allocator(), handle.get_stream(), row_ids_csr_size); - kernel_augmentScatter<<>>( - elements_v.data(), predicates_v.data(), addresses_v.data(), - vertex_t{SP * N}); + kernel_augmentScatter<<>>( + elements_v.data(), predicates_v.data(), addresses_v.data(), vertex_t{SP * N}); CHECK_CUDA(handle.get_stream()); - kernel_augmentation<<>>( - d_vertices_dev.row_assignments, d_vertices_dev.col_assignments, - elements_v.data(), d_row_data_dev, d_col_data_dev, vertex_t{N}, + kernel_augmentation<<>>( + d_vertices_dev.row_assignments, + d_vertices_dev.col_assignments, + elements_v.data(), + d_row_data_dev, + d_col_data_dev, + vertex_t{N}, row_ids_csr_size); CHECK_CUDA(handle.get_stream()); @@ -411,35 +467,46 @@ inline void augmentationPass(raft::handle_t const &handle, } template -inline void dualUpdate(raft::handle_t const &handle, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, int SP, vertex_t N, - weight_t epsilon) { +inline void dualUpdate(raft::handle_t const& handle, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + int SP, + vertex_t N, + weight_t epsilon) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks; - raft::mr::device::buffer sp_min_v(handle.get_device_allocator(), - handle.get_stream(), 1); + raft::mr::device::buffer sp_min_v( + handle.get_device_allocator(), handle.get_stream(), 1); - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, SP); - kernel_dualUpdate_1<<>>( - sp_min_v.data(), d_vertices_dev.col_slacks, d_vertices_dev.col_covers, SP, - N, std::numeric_limits::max()); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); + kernel_dualUpdate_1<<>>( + sp_min_v.data(), + d_vertices_dev.col_slacks, + d_vertices_dev.col_covers, + SP, + N, + std::numeric_limits::max()); CHECK_CUDA(handle.get_stream()); raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_dualUpdate_2<<>>( - sp_min_v.data(), d_vertices_dev.row_duals, d_vertices_dev.col_duals, - d_vertices_dev.col_slacks, d_vertices_dev.row_covers, - d_vertices_dev.col_covers, d_row_data_dev.is_visited, - d_col_data_dev.parents, SP, N, std::numeric_limits::max(), + kernel_dualUpdate_2<<>>( + sp_min_v.data(), + d_vertices_dev.row_duals, + d_vertices_dev.col_duals, + d_vertices_dev.col_slacks, + d_vertices_dev.row_covers, + d_vertices_dev.col_covers, + d_row_data_dev.is_visited, + d_col_data_dev.parents, + SP, + N, + std::numeric_limits::max(), epsilon); CHECK_CUDA(handle.get_stream()); @@ -447,18 +514,19 @@ inline void dualUpdate(raft::handle_t const &handle, // Function for calculating optimal objective function value using dual variables. template -inline void calcObjValDual(raft::handle_t const &handle, weight_t *d_obj_val, - Vertices &d_vertices_dev, int SP, - int N) { +inline void calcObjValDual(raft::handle_t const& handle, + weight_t* d_obj_val, + Vertices& d_vertices_dev, + int SP, + int N) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, SP); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); - kernel_calcObjValDual<<>>( + kernel_calcObjValDual<<>>( d_obj_val, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N); CHECK_CUDA(handle.get_stream()); @@ -466,20 +534,21 @@ inline void calcObjValDual(raft::handle_t const &handle, weight_t *d_obj_val, // Function for calculating optimal objective function value using dual variables. template -inline void calcObjValPrimal(raft::handle_t const &handle, weight_t *d_obj_val, - weight_t const *d_costs, - vertex_t const *d_row_assignments, int SP, - vertex_t N) { +inline void calcObjValPrimal(raft::handle_t const& handle, + weight_t* d_obj_val, + weight_t const* d_costs, + vertex_t const* d_row_assignments, + int SP, + vertex_t N) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, SP); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); - kernel_calcObjValPrimal<<>>(d_obj_val, d_costs, - d_row_assignments, SP, N); + kernel_calcObjValPrimal<<>>( + d_obj_val, d_costs, d_row_assignments, SP, N); CHECK_CUDA(handle.get_stream()); } diff --git a/cpp/include/raft/lap/lap_kernels.cuh b/cpp/include/raft/lap/lap_kernels.cuh index 8c9012ed72..45ad23afd1 100644 --- a/cpp/include/raft/lap/lap_kernels.cuh +++ b/cpp/include/raft/lap/lap_kernels.cuh @@ -48,42 +48,57 @@ const int AUGMENT{4}; const int MODIFIED{5}; template -bool __device__ near_zero(weight_t w, weight_t epsilon) { +bool __device__ near_zero(weight_t w, weight_t epsilon) +{ return ((w > -epsilon) && (w < epsilon)); } template <> -bool __device__ near_zero(int32_t w, int32_t epsilon) { +bool __device__ near_zero(int32_t w, int32_t epsilon) +{ return (w == 0); } template <> -bool __device__ near_zero(int64_t w, int64_t epsilon) { +bool __device__ near_zero(int64_t w, int64_t epsilon) +{ return (w == 0); } -// Device function for traversing the neighbors from start pointer to end pointer and updating the covers. -// The function sets d_next to 4 if there are uncovered zeros, indicating the requirement of Step 4 execution. +// Device function for traversing the neighbors from start pointer to end pointer and updating the +// covers. The function sets d_next to 4 if there are uncovered zeros, indicating the requirement of +// Step 4 execution. template -__device__ void cover_and_expand_row( - weight_t const *d_elements, weight_t const *d_row_duals, - weight_t const *d_col_duals, weight_t *d_col_slacks, int *d_row_covers, - int *d_col_covers, vertex_t const *d_col_assignments, bool *d_flag, - vertex_t *d_row_parents, vertex_t *d_col_parents, int *d_row_visited, - int *d_col_visited, vertex_t rowid, int spid, int colid, vertex_t N, - weight_t epsilon) { +__device__ void cover_and_expand_row(weight_t const* d_elements, + weight_t const* d_row_duals, + weight_t const* d_col_duals, + weight_t* d_col_slacks, + int* d_row_covers, + int* d_col_covers, + vertex_t const* d_col_assignments, + bool* d_flag, + vertex_t* d_row_parents, + vertex_t* d_col_parents, + int* d_row_visited, + int* d_col_visited, + vertex_t rowid, + int spid, + int colid, + vertex_t N, + weight_t epsilon) +{ int ROWID = spid * N + rowid; int COLID = spid * N + colid; - weight_t slack = d_elements[spid * N * N + rowid * N + colid] - - d_row_duals[ROWID] - d_col_duals[COLID]; + weight_t slack = + d_elements[spid * N * N + rowid * N + colid] - d_row_duals[ROWID] - d_col_duals[COLID]; int nxt_rowid = d_col_assignments[COLID]; int NXT_ROWID = spid * N + nxt_rowid; if (rowid != nxt_rowid && d_col_covers[COLID] == 0) { if (slack < d_col_slacks[COLID]) { - d_col_slacks[COLID] = slack; + d_col_slacks[COLID] = slack; d_col_parents[COLID] = ROWID; } @@ -92,13 +107,12 @@ __device__ void cover_and_expand_row( d_row_parents[NXT_ROWID] = COLID; // update parent info d_row_covers[NXT_ROWID] = 0; - d_col_covers[COLID] = 1; + d_col_covers[COLID] = 1; - if (d_row_visited[NXT_ROWID] != VISITED) - d_row_visited[NXT_ROWID] = ACTIVE; + if (d_row_visited[NXT_ROWID] != VISITED) d_row_visited[NXT_ROWID] = ACTIVE; } else { d_col_visited[COLID] = REVERSE; - *d_flag = true; + *d_flag = true; } } } @@ -107,28 +121,34 @@ __device__ void cover_and_expand_row( // Device function for traversing an alternating path from unassigned row to unassigned column. template -__device__ void __reverse_traversal( - int *d_row_visited, vertex_t *d_row_children, vertex_t *d_col_children, - vertex_t const *d_row_parents, vertex_t const *d_col_parents, int cur_colid) { +__device__ void __reverse_traversal(int* d_row_visited, + vertex_t* d_row_children, + vertex_t* d_col_children, + vertex_t const* d_row_parents, + vertex_t const* d_col_parents, + int cur_colid) +{ int cur_rowid = -1; while (cur_colid != -1) { d_col_children[cur_colid] = cur_rowid; - cur_rowid = d_col_parents[cur_colid]; + cur_rowid = d_col_parents[cur_colid]; d_row_children[cur_rowid] = cur_colid; - cur_colid = d_row_parents[cur_rowid]; + cur_colid = d_row_parents[cur_rowid]; } d_row_visited[cur_rowid] = AUGMENT; } // Device function for augmenting the alternating path from unassigned column to unassigned row. template -__device__ void __augment(vertex_t *d_row_assignments, - vertex_t *d_col_assignments, - vertex_t const *d_row_children, - vertex_t const *d_col_children, vertex_t cur_rowid, - vertex_t N) { +__device__ void __augment(vertex_t* d_row_assignments, + vertex_t* d_col_assignments, + vertex_t const* d_row_children, + vertex_t const* d_col_children, + vertex_t cur_rowid, + vertex_t N) +{ int cur_colid = -1; while (cur_rowid != -1) { @@ -145,20 +165,18 @@ __device__ void __augment(vertex_t *d_row_assignments, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_rowReduction(weight_t const *d_costs, - weight_t *d_row_duals, int SP, vertex_t N, - weight_t infinity) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; - int rowid = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void kernel_rowReduction( + weight_t const* d_costs, weight_t* d_row_duals, int SP, vertex_t N, weight_t infinity) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; + int rowid = blockIdx.x * blockDim.x + threadIdx.x; weight_t min = infinity; if (spid < SP && rowid < N) { for (int colid = 0; colid < N; colid++) { weight_t slack = d_costs[spid * N * N + rowid * N + colid]; - if (slack < min) { - min = slack; - } + if (slack < min) { min = slack; } } d_row_duals[spid * N + rowid] = min; @@ -169,25 +187,26 @@ __global__ void kernel_rowReduction(weight_t const *d_costs, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_columnReduction(weight_t const *d_costs, - weight_t const *d_row_duals, - weight_t *d_col_duals, int SP, - vertex_t N, weight_t infinity) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_columnReduction(weight_t const* d_costs, + weight_t const* d_row_duals, + weight_t* d_col_duals, + int SP, + vertex_t N, + weight_t infinity) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int colid = blockIdx.x * blockDim.x + threadIdx.x; weight_t min = infinity; if (spid < SP && colid < N) { for (int rowid = 0; rowid < N; rowid++) { - weight_t cost = d_costs[spid * N * N + rowid * N + colid]; + weight_t cost = d_costs[spid * N * N + rowid * N + colid]; weight_t row_dual = d_row_duals[spid * N + rowid]; weight_t slack = cost - row_dual; - if (slack < min) { - min = slack; - } + if (slack < min) { min = slack; } } d_col_duals[spid * N + colid] = min; @@ -196,12 +215,18 @@ __global__ void kernel_columnReduction(weight_t const *d_costs, // Kernel for calculating initial assignments. template -__global__ void kernel_computeInitialAssignments( - weight_t const *d_costs, weight_t const *d_row_duals, - weight_t const *d_col_duals, vertex_t *d_row_assignments, - vertex_t *d_col_assignments, int *d_row_lock, int *d_col_lock, int SP, - vertex_t N, weight_t epsilon) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_computeInitialAssignments(weight_t const* d_costs, + weight_t const* d_row_duals, + weight_t const* d_col_duals, + vertex_t* d_row_assignments, + vertex_t* d_col_assignments, + int* d_row_lock, + int* d_col_lock, + int SP, + vertex_t N, + weight_t epsilon) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int colid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && colid < N) { @@ -213,15 +238,15 @@ __global__ void kernel_computeInitialAssignments( if (d_col_lock[overall_colid] == 1) break; - weight_t cost = d_costs[spid * N * N + rowid * N + colid]; + weight_t cost = d_costs[spid * N * N + rowid * N + colid]; weight_t row_dual = d_row_duals[overall_rowid]; - weight_t slack = cost - row_dual - col_dual; + weight_t slack = cost - row_dual - col_dual; if (near_zero(slack, epsilon)) { if (atomicCAS(&d_row_lock[overall_rowid], 0, 1) == 0) { d_row_assignments[overall_rowid] = colid; d_col_assignments[overall_colid] = rowid; - d_col_lock[overall_colid] = 1; + d_col_lock[overall_colid] = 1; } } } @@ -230,10 +255,10 @@ __global__ void kernel_computeInitialAssignments( // Kernel for populating the cover arrays and initializing alternating tree. template -__global__ void kernel_computeRowCovers(vertex_t *d_row_assignments, - int *d_row_covers, int *d_row_visited, - int SP, vertex_t N) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_computeRowCovers( + vertex_t* d_row_assignments, int* d_row_covers, int* d_row_visited, int SP, vertex_t N) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int rowid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && rowid < N) { @@ -249,11 +274,10 @@ __global__ void kernel_computeRowCovers(vertex_t *d_row_assignments, // Kernel for populating the predicate matrix for edges in row major format. template -__global__ void kernel_rowPredicateConstructionCSR(bool *d_predicates, - vertex_t *d_addresses, - int *d_row_visited, int SP, - vertex_t N) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_rowPredicateConstructionCSR( + bool* d_predicates, vertex_t* d_addresses, int* d_row_visited, int SP, vertex_t N) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int rowid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && rowid < N) { @@ -261,130 +285,160 @@ __global__ void kernel_rowPredicateConstructionCSR(bool *d_predicates, if (d_row_visited[index] == ACTIVE) { d_predicates[index] = true; - d_addresses[index] = 1; + d_addresses[index] = 1; } else { d_predicates[index] = false; - d_addresses[index] = 0; + d_addresses[index] = 0; } } } // Kernel for scattering the edges based on the scatter addresses. template -__global__ void kernel_rowScatterCSR(bool const *d_predicates, - vertex_t const *d_addresses, - vertex_t *d_neighbors, vertex_t *d_ptrs, - vertex_t M, int SP, vertex_t N) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_rowScatterCSR(bool const* d_predicates, + vertex_t const* d_addresses, + vertex_t* d_neighbors, + vertex_t* d_ptrs, + vertex_t M, + int SP, + vertex_t N) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int rowid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && rowid < N) { int index = spid * N + rowid; - bool predicate = d_predicates[index]; + bool predicate = d_predicates[index]; vertex_t compid = d_addresses[index]; - if (predicate) { - d_neighbors[compid] = rowid; - } + if (predicate) { d_neighbors[compid] = rowid; } if (rowid == 0) { d_ptrs[spid] = compid; - d_ptrs[SP] = M; + d_ptrs[SP] = M; } } } // Kernel for finding the minimum zero cover. template -__global__ void kernel_coverAndExpand(bool *d_flag, vertex_t const *d_ptrs, - vertex_t const *d_neighbors, - weight_t const *d_elements, +__global__ void kernel_coverAndExpand(bool* d_flag, + vertex_t const* d_ptrs, + vertex_t const* d_neighbors, + weight_t const* d_elements, Vertices d_vertices, VertexData d_row_data, - VertexData d_col_data, int SP, - vertex_t N, weight_t epsilon) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; + VertexData d_col_data, + int SP, + vertex_t N, + weight_t epsilon) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int colid = blockIdx.x * blockDim.x + threadIdx.x; // Load values into local memory if (spid < SP && colid < N) { thrust::for_each( - thrust::seq, d_neighbors + d_ptrs[spid], d_neighbors + d_ptrs[spid + 1], - [d_elements, d_vertices, d_flag, d_row_data, d_col_data, spid, colid, N, - epsilon] __device__(vertex_t rowid) { - cover_and_expand_row( - d_elements, d_vertices.row_duals, d_vertices.col_duals, - d_vertices.col_slacks, d_vertices.row_covers, d_vertices.col_covers, - d_vertices.col_assignments, d_flag, d_row_data.parents, - d_col_data.parents, d_row_data.is_visited, d_col_data.is_visited, - rowid, spid, colid, N, epsilon); + thrust::seq, + d_neighbors + d_ptrs[spid], + d_neighbors + d_ptrs[spid + 1], + [d_elements, d_vertices, d_flag, d_row_data, d_col_data, spid, colid, N, epsilon] __device__( + vertex_t rowid) { + cover_and_expand_row(d_elements, + d_vertices.row_duals, + d_vertices.col_duals, + d_vertices.col_slacks, + d_vertices.row_covers, + d_vertices.col_covers, + d_vertices.col_assignments, + d_flag, + d_row_data.parents, + d_col_data.parents, + d_row_data.is_visited, + d_col_data.is_visited, + rowid, + spid, + colid, + N, + epsilon); }); } } // Kernel for constructing the predicates for reverse pass or augmentation candidates. template -__global__ void kernel_augmentPredicateConstruction(bool *d_predicates, - vertex_t *d_addresses, - int *d_visited, int size) { +__global__ void kernel_augmentPredicateConstruction(bool* d_predicates, + vertex_t* d_addresses, + int* d_visited, + int size) +{ int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { int visited = d_visited[id]; if ((visited == REVERSE) || (visited == AUGMENT)) { d_predicates[id] = true; - d_addresses[id] = 1; + d_addresses[id] = 1; } else { d_predicates[id] = false; - d_addresses[id] = 0; + d_addresses[id] = 0; } } } // Kernel for scattering the vertices based on the scatter addresses. template -__global__ void kernel_augmentScatter(vertex_t *d_elements, - bool const *d_predicates, - vertex_t const *d_addresses, - std::size_t size) { +__global__ void kernel_augmentScatter(vertex_t* d_elements, + bool const* d_predicates, + vertex_t const* d_addresses, + std::size_t size) +{ int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { - if (d_predicates[id]) { - d_elements[d_addresses[id]] = id; - } + if (d_predicates[id]) { d_elements[d_addresses[id]] = id; } } } // Kernel for executing the reverse pass of the maximum matching algorithm. template -__global__ void kernel_reverseTraversal(vertex_t *d_elements, +__global__ void kernel_reverseTraversal(vertex_t* d_elements, VertexData d_row_data, VertexData d_col_data, - int size) { + int size) +{ int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { - __reverse_traversal(d_row_data.is_visited, d_row_data.children, - d_col_data.children, d_row_data.parents, - d_col_data.parents, d_elements[id]); + __reverse_traversal(d_row_data.is_visited, + d_row_data.children, + d_col_data.children, + d_row_data.parents, + d_col_data.parents, + d_elements[id]); } } // Kernel for executing the augmentation pass of the maximum matching algorithm. template -__global__ void kernel_augmentation(vertex_t *d_row_assignments, - vertex_t *d_col_assignments, - vertex_t const *d_row_elements, +__global__ void kernel_augmentation(vertex_t* d_row_assignments, + vertex_t* d_col_assignments, + vertex_t const* d_row_elements, VertexData d_row_data, - VertexData d_col_data, vertex_t N, - vertex_t size) { + VertexData d_col_data, + vertex_t N, + vertex_t size) +{ int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { - __augment(d_row_assignments, d_col_assignments, d_row_data.children, - d_col_data.children, d_row_elements[id], N); + __augment(d_row_assignments, + d_col_assignments, + d_row_data.children, + d_col_data.children, + d_row_elements[id], + N); } } @@ -392,18 +446,21 @@ __global__ void kernel_augmentation(vertex_t *d_row_assignments, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_dualUpdate_1(weight_t *d_sp_min, - weight_t const *d_col_slacks, - int const *d_col_covers, int SP, vertex_t N, - weight_t infinity) { +__global__ void kernel_dualUpdate_1(weight_t* d_sp_min, + weight_t const* d_col_slacks, + int const* d_col_covers, + int SP, + vertex_t N, + weight_t infinity) +{ int spid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP) { weight_t min = infinity; for (int colid = 0; colid < N; colid++) { - int index = spid * N + colid; + int index = spid * N + colid; weight_t slack = d_col_slacks[index]; - int col_cover = d_col_covers[index]; + int col_cover = d_col_covers[index]; if (col_cover == 0) if (slack < min) min = slack; @@ -417,21 +474,29 @@ __global__ void kernel_dualUpdate_1(weight_t *d_sp_min, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_dualUpdate_2( - weight_t const *d_sp_min, weight_t *d_row_duals, weight_t *d_col_duals, - weight_t *d_col_slacks, int const *d_row_covers, int const *d_col_covers, - int *d_row_visited, vertex_t *d_col_parents, int SP, vertex_t N, - weight_t infinity, weight_t epsilon) { +__global__ void kernel_dualUpdate_2(weight_t const* d_sp_min, + weight_t* d_row_duals, + weight_t* d_col_duals, + weight_t* d_col_slacks, + int const* d_row_covers, + int const* d_col_covers, + int* d_row_visited, + vertex_t* d_col_parents, + int SP, + vertex_t N, + weight_t infinity, + weight_t epsilon) +{ int spid = blockIdx.y * blockDim.y + threadIdx.y; - int id = blockIdx.x * blockDim.x + threadIdx.x; + int id = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && id < N) { int index = spid * N + id; if (d_sp_min[spid] < infinity) { weight_t theta = d_sp_min[spid]; - int row_cover = d_row_covers[index]; - int col_cover = d_col_covers[index]; + int row_cover = d_row_covers[index]; + int col_cover = d_col_covers[index]; if (row_cover == 0) // Row vertex is reachable from source. d_row_duals[index] += theta; @@ -453,10 +518,12 @@ __global__ void kernel_dualUpdate_2( // Kernel for calculating optimal objective function value using dual variables. template -__global__ void kernel_calcObjValDual(weight_t *d_obj_val_dual, - weight_t const *d_row_duals, - weight_t const *d_col_duals, int SP, - vertex_t N) { +__global__ void kernel_calcObjValDual(weight_t* d_obj_val_dual, + weight_t const* d_row_duals, + weight_t const* d_col_duals, + int SP, + vertex_t N) +{ int spid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP) { @@ -471,10 +538,12 @@ __global__ void kernel_calcObjValDual(weight_t *d_obj_val_dual, // Kernel for calculating optimal objective function value using dual variables. template -__global__ void kernel_calcObjValPrimal(weight_t *d_obj_val_primal, - weight_t const *d_costs, - vertex_t const *d_row_assignments, - int SP, vertex_t N) { +__global__ void kernel_calcObjValPrimal(weight_t* d_obj_val_primal, + weight_t const* d_costs, + vertex_t const* d_row_assignments, + int SP, + vertex_t N) +{ int spid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP) { diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh index 7a454f64e2..11d3174951 100644 --- a/cpp/include/raft/linalg/add.cuh +++ b/cpp/include/raft/linalg/add.cuh @@ -37,8 +37,8 @@ namespace linalg { * @param stream cuda stream where to launch work */ template -void addScalar(OutT *out, const InT *in, InT scalar, IdxType len, - cudaStream_t stream) { +void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) +{ auto op = [scalar] __device__(InT in) { return OutT(in + scalar); }; unaryOp(out, in, len, op, stream); } @@ -57,23 +57,24 @@ void addScalar(OutT *out, const InT *in, InT scalar, IdxType len, * @param stream cuda stream where to launch work */ template -void add(OutT *out, const InT *in1, const InT *in2, IdxType len, - cudaStream_t stream) { +void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) +{ auto op = [] __device__(InT a, InT b) { return OutT(a + b); }; binaryOp(out, in1, in2, len, op, stream); } template -__global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, - IdxType len) { +__global__ void add_dev_scalar_kernel(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len) +{ IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { - outDev[i] = inDev[i] + *singleScalarDev; - } + if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; } } -/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i] +/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and + * write result to outDev[i] * @tparam math_t data-type upon which the math operation will be performed * @tparam IdxType Integer type used to for addressing * @param outDev the output buffer @@ -83,14 +84,16 @@ __global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev, * @param stream cuda stream */ template -void addDevScalar(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, IdxType len, - cudaStream_t stream) { +void addDevScalar(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len, + cudaStream_t stream) +{ // TODO: block dimension has not been tuned dim3 block(256); dim3 grid(raft::ceildiv(len, (IdxType)block.x)); - add_dev_scalar_kernel - <<>>(outDev, inDev, singleScalarDev, len); + add_dev_scalar_kernel<<>>(outDev, inDev, singleScalarDev, len); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh index 940d786e87..a49a433941 100644 --- a/cpp/include/raft/linalg/binary_op.cuh +++ b/cpp/include/raft/linalg/binary_op.cuh @@ -22,10 +22,10 @@ namespace raft { namespace linalg { -template -__global__ void binaryOpKernel(OutType *out, const InType *in1, - const InType *in2, IdxType len, Lambda op) { +template +__global__ void binaryOpKernel( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op) +{ typedef TxN_t InVecType; typedef TxN_t OutVecType; InVecType a, b; @@ -42,12 +42,11 @@ __global__ void binaryOpKernel(OutType *out, const InType *in1, c.store(out, idx); } -template -void binaryOpImpl(OutType *out, const InType *in1, const InType *in2, - IdxType len, Lambda op, cudaStream_t stream) { - const IdxType nblks = - raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); +template +void binaryOpImpl( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); binaryOpKernel <<>>(out, in1, in2, len, op); CUDA_CHECK(cudaPeekAtLastError()); @@ -56,8 +55,8 @@ void binaryOpImpl(OutType *out, const InType *in1, const InType *in2, /** * @brief Checks if addresses are aligned on N bytes */ -inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, - uint64_t N) { +inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint64_t N) +{ return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0; } @@ -77,38 +76,36 @@ inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, * @note Lambda must be a functor with the following signature: * `OutType func(const InType& val1, const InType& val2);` */ -template -void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len, - Lambda op, cudaStream_t stream) { - constexpr auto maxSize = - sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType); - size_t bytes = len * maxSize; - uint64_t in1Addr = uint64_t(in1); - uint64_t in2Addr = uint64_t(in2); - uint64_t outAddr = uint64_t(out); - if (16 / maxSize && bytes % 16 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 16)) { +template +void binaryOp( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) +{ + constexpr auto maxSize = sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t in1Addr = uint64_t(in1); + uint64_t in2Addr = uint64_t(in2); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 16)) { binaryOpImpl( out, in1, in2, len, op, stream); - } else if (8 / maxSize && bytes % 8 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 8)) { + } else if (8 / maxSize && bytes % 8 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 8)) { binaryOpImpl( out, in1, in2, len, op, stream); - } else if (4 / maxSize && bytes % 4 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 4)) { + } else if (4 / maxSize && bytes % 4 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 4)) { binaryOpImpl( out, in1, in2, len, op, stream); - } else if (2 / maxSize && bytes % 2 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 2)) { + } else if (2 / maxSize && bytes % 2 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 2)) { binaryOpImpl( out, in1, in2, len, op, stream); } else if (1 / maxSize) { binaryOpImpl( out, in1, in2, len, op, stream); } else { - binaryOpImpl(out, in1, in2, len, - op, stream); + binaryOpImpl(out, in1, in2, len, op, stream); } } diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh index b5a93c4953..b129fe4758 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh @@ -122,9 +122,16 @@ namespace linalg { * conditioned systems. Negative values mean no regularizaton. */ template -void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, - void *workspace, int *n_bytes, cublasFillMode_t uplo, - cudaStream_t stream, math_t eps = -1) { +void choleskyRank1Update(const raft::handle_t& handle, + math_t* L, + int n, + int ld, + void* workspace, + int* n_bytes, + cublasFillMode_t uplo, + cudaStream_t stream, + math_t eps = -1) +{ // The matrix A' is defined as: // A' = [[A_11, A_12] // [A_21, A_22]] @@ -144,18 +151,17 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, // We need a workspace in device memory to store a scalar. Additionally, in // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats. const int align = 256; - int offset = (uplo == CUBLAS_FILL_MODE_LOWER) - ? raft::alignTo(sizeof(math_t) * (n - 1), align) - : 0; + int offset = + (uplo == CUBLAS_FILL_MODE_LOWER) ? raft::alignTo(sizeof(math_t) * (n - 1), align) : 0; if (workspace == nullptr) { *n_bytes = offset + 1 * sizeof(math_t); return; } - math_t *s = reinterpret_cast(((char *)workspace) + offset); - math_t *L_22 = L + (n - 1) * ld + n - 1; + math_t* s = reinterpret_cast(((char*)workspace) + offset); + math_t* L_22 = L + (n - 1) * ld + n - 1; - math_t *A_new; - math_t *A_row; + math_t* A_new; + math_t* A_row; if (uplo == CUBLAS_FILL_MODE_UPPER) { // A_new is stored as the n-1 th column of L A_new = L + (n - 1) * ld; @@ -164,27 +170,36 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, // as the n-th row of L. Since the matrix is column major, this is non // contiguous. We copy elements from A_row to a contiguous workspace A_new. A_row = L + n - 1; - A_new = reinterpret_cast(workspace); - CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, - A_row, ld, A_new, 1, stream)); + A_new = reinterpret_cast(workspace); + CUBLAS_CHECK( + raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream)); } - cublasOperation_t op = - (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t op = (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; if (n > 1) { // Calculate L_12 = x by solving equation L_11 x = A_12 math_t alpha = 1; - CUBLAS_CHECK(raft::linalg::cublastrsm( - handle.get_cublas_handle(), CUBLAS_SIDE_LEFT, uplo, op, - CUBLAS_DIAG_NON_UNIT, n - 1, 1, &alpha, L, ld, A_new, n - 1, stream)); + CUBLAS_CHECK(raft::linalg::cublastrsm(handle.get_cublas_handle(), + CUBLAS_SIDE_LEFT, + uplo, + op, + CUBLAS_DIAG_NON_UNIT, + n - 1, + 1, + &alpha, + L, + ld, + A_new, + n - 1, + stream)); // A_new now stores L_12, we calculate s = L_12 * L_12 - CUBLAS_CHECK(raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, - A_new, 1, A_new, 1, s, stream)); + CUBLAS_CHECK( + raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream)); if (uplo == CUBLAS_FILL_MODE_LOWER) { // Copy back the L_12 elements as the n-th row of L - CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, - A_new, 1, A_row, ld, stream)); + CUBLAS_CHECK( + raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream)); } } else { // n == 1 case CUDA_CHECK(cudaMemsetAsync(s, 0, sizeof(math_t), stream)); @@ -202,9 +217,7 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, // the system is very ill conditioned then the A_22 - L_12 * L_12 can be // negative, which would result L_22 = NaN. A small positive eps parameter // can be used to prevent this. - if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { - L_22_host = eps; - } + if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { L_22_host = eps; } ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update"); raft::update_device(L_22, &L_22_host, 1, stream); } diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh index ef983ff3d0..7e0744f98a 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/coalesced_reduction.cuh @@ -26,18 +26,27 @@ namespace linalg { // of the matrix, i.e. reduce along rows for row major or reduce along columns // for column major layout. Kernel does an inplace reduction adding to original // values of dots. -template -__global__ void coalescedReductionKernel(OutType *dots, const InType *data, - int D, int N, OutType init, +template +__global__ void coalescedReductionKernel(OutType* dots, + const InType* data, + int D, + int N, + OutType init, MainLambda main_op, ReduceLambda reduce_op, FinalLambda final_op, - bool inplace = false) { + bool inplace = false) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; OutType thread_data = init; - IdxType rowStart = blockIdx.x * D; + IdxType rowStart = blockIdx.x * D; for (IdxType i = threadIdx.x; i < D; i += TPB) { IdxType idx = rowStart + i; thread_data = reduce_op(thread_data, main_op(data[idx], i)); @@ -79,33 +88,37 @@ __global__ void coalescedReductionKernel(OutType *dots, const InType *data, * @param inplace reduction result added inplace or overwrites old values? * @param stream cuda stream where to launch work */ -template , +template , typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void coalescedReduction(OutType *dots, const InType *data, int D, int N, - OutType init, cudaStream_t stream, bool inplace = false, - MainLambda main_op = raft::Nop(), + typename FinalLambda = raft::Nop> +void coalescedReduction(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) { + FinalLambda final_op = raft::Nop()) +{ // One block per reduction // Efficient only for large leading dimensions if (D <= 32) { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); } else if (D <= 64) { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); } else if (D <= 128) { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); } else { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh index aa711a9140..35d9d96ea4 100644 --- a/cpp/include/raft/linalg/contractions.cuh +++ b/cpp/include/raft/linalg/contractions.cuh @@ -55,8 +55,7 @@ namespace linalg { * thread block. This also determines the number of threads per * thread block */ -template +template struct KernelPolicy { enum { /** number of elements along K worked upon per main loop iteration */ @@ -101,8 +100,7 @@ struct KernelPolicy { }; // struct KernelPolicy -template +template struct ColKernelPolicy { enum { /** number of elements along K worked upon per main loop iteration */ @@ -151,7 +149,8 @@ struct ColKernelPolicy { * @{ */ template -struct Policy4x4 {}; +struct Policy4x4 { +}; template struct Policy4x4 { @@ -180,8 +179,7 @@ struct Policy4x4 { * @tparam Policy policy used to customize memory access behavior. * See documentation for `KernelPolicy` to know more. */ -template +template struct Contractions_NT { protected: typedef Policy P; @@ -247,8 +245,7 @@ struct Contractions_NT { * @param[in] _k number of cols of X and Y * @param[in] _smem shared memory region used during computations */ - DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, - IdxT _k, char* _smem) + DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, IdxT _k, char* _smem) : m(_m), n(_n), k(_k), @@ -265,7 +262,9 @@ struct Contractions_NT { sx((DataT*)_smem), sy(&(sx[P::SmemPageX])), pageWr(0), - pageRd(0) {} + pageRd(0) + { + } /** * @brief Ctor @@ -276,8 +275,15 @@ struct Contractions_NT { * @param[in] _k number of cols of X and Y * @param[in] _smem shared memory region used during computations */ - DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, - IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, char* _smem) + DI Contractions_NT(const DataT* _x, + const DataT* _y, + IdxT _m, + IdxT _n, + IdxT _k, + IdxT _lda, + IdxT _ldb, + IdxT _ldd, + char* _smem) : m(_m), n(_n), k(_k), @@ -291,17 +297,18 @@ struct Contractions_NT { sx((DataT*)_smem), sy(&(sx[P::SmemPageX])), pageWr(0), - pageRd(0) { + pageRd(0) + { if (isRowMajor) { xrowid = IdxT(blockIdx.y) * P::Mblk + srowid; yrowid = IdxT(blockIdx.x) * P::Nblk + srowid; - x = _x + xrowid * lda; - y = _y + yrowid * ldb; + x = _x + xrowid * lda; + y = _y + yrowid * ldb; } else { xrowid = IdxT(blockIdx.y) * P::Mblk; yrowid = IdxT(blockIdx.x) * P::Nblk; - x = _x + xrowid + srowid * lda; - y = _y + yrowid + srowid * ldb; + x = _x + xrowid + srowid * lda; + y = _y + yrowid + srowid * ldb; } } @@ -310,7 +317,8 @@ struct Contractions_NT { * @brief Load current block of X/Y from global memory to registers * @param[in] kidx current start index of k to be loaded */ - DI void ldgXY(IdxT kidx) { + DI void ldgXY(IdxT kidx) + { ldgX(kidx); ldgY(kidx); } @@ -319,7 +327,8 @@ struct Contractions_NT { * @brief Store current block of X/Y from registers to smem * @param[in] kidx current start index of k to be loaded */ - DI void stsXY() { + DI void stsXY() + { stsX(sx + pageWr * P::SmemPage); stsY(sy + pageWr * P::SmemPage); } @@ -328,13 +337,15 @@ struct Contractions_NT { * @brief Load X and Y block from shared memory to registers * @param[in] kidx k value from the current k-block to be loaded from smem */ - DI void ldsXY(int kidx) { + DI void ldsXY(int kidx) + { ldsX(kidx, sx + pageRd * P::SmemPage); ldsY(kidx, sy + pageRd * P::SmemPage); } private: - DI void ldgX(IdxT kidx) { + DI void ldgX(IdxT kidx) + { if (isRowMajor) { auto numRows = m; auto koffset = kidx + scolid; @@ -351,11 +362,10 @@ struct Contractions_NT { } } else { const auto numRows = k; - auto koffset = scolid; + auto koffset = scolid; #pragma unroll for (int i = 0; i < P::LdgPerThX; ++i) { - if ((koffset + xrowid) < lda && - (srowid + kidx + i * P::LdgRowsX) < numRows) { + if ((koffset + xrowid) < lda && (srowid + kidx + i * P::LdgRowsX) < numRows) { ldg(ldgDataX[i], x + (kidx + i * P::LdgRowsX) * lda + koffset); } else { #pragma unroll @@ -367,7 +377,8 @@ struct Contractions_NT { } } - DI void ldgY(IdxT kidx) { + DI void ldgY(IdxT kidx) + { if (isRowMajor) { auto numRows = n; auto koffset = kidx + scolid; @@ -387,8 +398,7 @@ struct Contractions_NT { auto koffset = scolid; #pragma unroll for (int i = 0; i < P::LdgPerThY; ++i) { - if ((koffset + yrowid) < ldb && - (srowid + kidx + i * P::LdgRowsY) < numRows) { + if ((koffset + yrowid) < ldb && (srowid + kidx + i * P::LdgRowsY) < numRows) { ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset); } else { #pragma unroll @@ -400,7 +410,8 @@ struct Contractions_NT { } } - DI void stsX(DataT* smem) { + DI void stsX(DataT* smem) + { auto* saddr = smem + srowid * P::SmemStride + scolid; #pragma unroll for (int i = 0; i < P::LdgPerThX; ++i) { @@ -408,7 +419,8 @@ struct Contractions_NT { } } - DI void stsY(DataT* smem) { + DI void stsY(DataT* smem) + { auto* saddr = smem + srowid * P::SmemStride + scolid; #pragma unroll for (int i = 0; i < P::LdgPerThY; ++i) { @@ -416,7 +428,8 @@ struct Contractions_NT { } } - DI void ldsX(int kidx, DataT* smem) { + DI void ldsX(int kidx, DataT* smem) + { if (isRowMajor) { auto* saddr = smem + accrowid * P::SmemStride + kidx; #pragma unroll @@ -435,7 +448,8 @@ struct Contractions_NT { } } - DI void ldsY(int kidx, DataT* smem) { + DI void ldsY(int kidx, DataT* smem) + { if (isRowMajor) { auto* saddr = smem + acccolid * P::SmemStride + kidx; #pragma unroll diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h index 7c79e6c91d..2d18691410 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.h +++ b/cpp/include/raft/linalg/cublas_wrappers.h @@ -25,8 +25,7 @@ #include #define _CUBLAS_ERR_TO_STR(err) \ - case err: \ - return #err + case err: return #err namespace raft { @@ -34,15 +33,15 @@ namespace raft { * @brief Exception thrown when a cuBLAS error is encountered. */ struct cublas_error : public raft::exception { - explicit cublas_error(char const *const message) : raft::exception(message) {} - explicit cublas_error(std::string const &message) - : raft::exception(message) {} + explicit cublas_error(char const* const message) : raft::exception(message) {} + explicit cublas_error(std::string const& message) : raft::exception(message) {} }; namespace linalg { namespace detail { -inline const char *cublas_error_to_string(cublasStatus_t err) { +inline const char* cublas_error_to_string(cublasStatus_t err) +{ switch (err) { _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); @@ -54,8 +53,7 @@ inline const char *cublas_error_to_string(cublasStatus_t err) { _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); - default: - return "CUBLAS_STATUS_UNKNOWN"; + default: return "CUBLAS_STATUS_UNKNOWN"; }; } @@ -71,16 +69,19 @@ inline const char *cublas_error_to_string(cublasStatus_t err) { * Invokes a cuBLAS runtime API function call, if the call does not return * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred */ -#define CUBLAS_TRY(call) \ - do { \ - cublasStatus_t const status = (call); \ - if (CUBLAS_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG( \ - msg, "cuBLAS error encountered at: ", "call='%s', Reason=%d:%s", \ - #call, status, raft::linalg::detail::cublas_error_to_string(status)); \ - throw raft::cublas_error(msg); \ - } \ +#define CUBLAS_TRY(call) \ + do { \ + cublasStatus_t const status = (call); \ + if (CUBLAS_STATUS_SUCCESS != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "cuBLAS error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + raft::linalg::detail::cublas_error_to_string(status)); \ + throw raft::cublas_error(msg); \ + } \ } while (0) /** FIXME: temporary alias for cuML compatibility */ @@ -107,22 +108,39 @@ namespace linalg { * @{ */ template -cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, const T *alpha, - const T *x, int incx, T *y, int incy, +cublasStatus_t cublasaxpy(cublasHandle_t handle, + int n, + const T* alpha, + const T* x, + int incx, + T* y, + int incy, cudaStream_t stream); template <> -inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, - const float *alpha, const float *x, int incx, - float *y, int incy, cudaStream_t stream) { +inline cublasStatus_t cublasaxpy(cublasHandle_t handle, + int n, + const float* alpha, + const float* x, + int incx, + float* y, + int incy, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSaxpy(handle, n, alpha, x, incx, y, incy); } template <> -inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, - const double *alpha, const double *x, int incx, - double *y, int incy, cudaStream_t stream) { +inline cublasStatus_t cublasaxpy(cublasHandle_t handle, + int n, + const double* alpha, + const double* x, + int incx, + double* y, + int incy, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDaxpy(handle, n, alpha, x, incx, y, incy); } @@ -133,21 +151,21 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, * @{ */ template -cublasStatus_t cublasSwap(cublasHandle_t handle, int n, T *x, int incx, T *y, - int incy, cudaStream_t stream); +cublasStatus_t cublasSwap( + cublasHandle_t handle, int n, T* x, int incx, T* y, int incy, cudaStream_t stream); template <> -inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, float *x, - int incx, float *y, int incy, - cudaStream_t stream) { +inline cublasStatus_t cublasSwap( + cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSswap(handle, n, x, incx, y, incy); } template <> -inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, double *x, - int incx, double *y, int incy, - cudaStream_t stream) { +inline cublasStatus_t cublasSwap( + cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDswap(handle, n, x, incx, y, incy); } @@ -159,20 +177,20 @@ inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, double *x, * @{ */ template -cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const T *x, int incx, - T *y, int incy, cudaStream_t stream); +cublasStatus_t cublasCopy( + cublasHandle_t handle, int n, const T* x, int incx, T* y, int incy, cudaStream_t stream); template <> -inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const float *x, - int incx, float *y, int incy, - cudaStream_t stream) { +inline cublasStatus_t cublasCopy( + cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasScopy(handle, n, x, incx, y, incy); } template <> -inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const double *x, - int incx, double *y, int incy, - cudaStream_t stream) { +inline cublasStatus_t cublasCopy( + cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDcopy(handle, n, x, incx, y, incy); } @@ -183,31 +201,56 @@ inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const double *x, * @{ */ template -cublasStatus_t cublasgemv(cublasHandle_t handle, cublasOperation_t transA, - int m, int n, const T *alfa, const T *A, int lda, - const T *x, int incx, const T *beta, T *y, int incy, +cublasStatus_t cublasgemv(cublasHandle_t handle, + cublasOperation_t transA, + int m, + int n, + const T* alfa, + const T* A, + int lda, + const T* x, + int incx, + const T* beta, + T* y, + int incy, cudaStream_t stream); template <> inline cublasStatus_t cublasgemv(cublasHandle_t handle, - cublasOperation_t transA, int m, int n, - const float *alfa, const float *A, int lda, - const float *x, int incx, const float *beta, - float *y, int incy, cudaStream_t stream) { + cublasOperation_t transA, + int m, + int n, + const float* alfa, + const float* A, + int lda, + const float* x, + int incx, + const float* beta, + float* y, + int incy, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, - incy); + return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy); } template <> inline cublasStatus_t cublasgemv(cublasHandle_t handle, - cublasOperation_t transA, int m, int n, - const double *alfa, const double *A, int lda, - const double *x, int incx, const double *beta, - double *y, int incy, cudaStream_t stream) { + cublasOperation_t transA, + int m, + int n, + const double* alfa, + const double* A, + int lda, + const double* x, + int incx, + const double* beta, + double* y, + int incy, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, - incy); + return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy); } /** @} */ @@ -216,23 +259,47 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, const T *alpha, - const T *x, int incx, const T *y, int incy, T *A, - int lda, cudaStream_t stream); +cublasStatus_t cublasger(cublasHandle_t handle, + int m, + int n, + const T* alpha, + const T* x, + int incx, + const T* y, + int incy, + T* A, + int lda, + cudaStream_t stream); template <> -inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, - const float *alpha, const float *x, int incx, - const float *y, int incy, float *A, int lda, - cudaStream_t stream) { +inline cublasStatus_t cublasger(cublasHandle_t handle, + int m, + int n, + const float* alpha, + const float* x, + int incx, + const float* y, + int incy, + float* A, + int lda, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda); } template <> -inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, - const double *alpha, const double *x, int incx, - const double *y, int incy, double *A, int lda, - cudaStream_t stream) { +inline cublasStatus_t cublasger(cublasHandle_t handle, + int m, + int n, + const double* alpha, + const double* x, + int incx, + const double* y, + int incy, + double* A, + int lda, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda); } @@ -243,34 +310,62 @@ inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, * @{ */ template -cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, int k, - const T *alfa, const T *A, int lda, const T *B, - int ldb, const T *beta, T *C, int ldc, +cublasStatus_t cublasgemm(cublasHandle_t handle, + cublasOperation_t transA, + cublasOperation_t transB, + int m, + int n, + int k, + const T* alfa, + const T* A, + int lda, + const T* B, + int ldb, + const T* beta, + T* C, + int ldc, cudaStream_t stream); template <> inline cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, int k, - const float *alfa, const float *A, int lda, - const float *B, int ldb, const float *beta, - float *C, int ldc, cudaStream_t stream) { + cublasOperation_t transB, + int m, + int n, + int k, + const float* alfa, + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, + float* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, - beta, C, ldc); + return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc); } template <> inline cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, int k, - const double *alfa, const double *A, int lda, - const double *B, int ldb, const double *beta, - double *C, int ldc, cudaStream_t stream) { + cublasOperation_t transB, + int m, + int n, + int k, + const double* alfa, + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, + double* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, - beta, C, ldc); + return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc); } /** @} */ @@ -281,38 +376,93 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle, template cublasStatus_t cublasgemmBatched(cublasHandle_t handle, // NOLINT cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - const T *alpha, - const T *const Aarray[], // NOLINT - int lda, const T *const Barray[], // NOLINT - int ldb, const T *beta, - T *Carray[], // NOLINT - int ldc, int batchCount, cudaStream_t stream); + cublasOperation_t transb, + int m, + int n, + int k, + const T* alpha, + const T* const Aarray[], // NOLINT + int lda, + const T* const Barray[], // NOLINT + int ldb, + const T* beta, + T* Carray[], // NOLINT + int ldc, + int batchCount, + cudaStream_t stream); template <> inline cublasStatus_t cublasgemmBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const float *alpha, - const float *const Aarray[], // NOLINT - int lda, const float *const Barray[], // NOLINT - int ldb, const float *beta, float *Carray[], // NOLINT - int ldc, int batchCount, cudaStream_t stream) { + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, + const float* const Aarray[], // NOLINT + int lda, + const float* const Barray[], // NOLINT + int ldb, + const float* beta, + float* Carray[], // NOLINT + int ldc, + int batchCount, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, - Barray, ldb, beta, Carray, ldc, batchCount); + return cublasSgemmBatched(handle, + transa, + transb, + m, + n, + k, + alpha, + Aarray, + lda, + Barray, + ldb, + beta, + Carray, + ldc, + batchCount); } template <> inline cublasStatus_t cublasgemmBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const double *alpha, - const double *const Aarray[], // NOLINT - int lda, const double *const Barray[], // NOLINT - int ldb, const double *beta, double *Carray[], // NOLINT - int ldc, int batchCount, cudaStream_t stream) { + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double* alpha, + const double* const Aarray[], // NOLINT + int lda, + const double* const Barray[], // NOLINT + int ldb, + const double* beta, + double* Carray[], // NOLINT + int ldc, + int batchCount, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, - Barray, ldb, beta, Carray, ldc, batchCount); + return cublasDgemmBatched(handle, + transa, + transb, + m, + n, + k, + alpha, + Aarray, + lda, + Barray, + ldb, + beta, + Carray, + ldc, + batchCount); } /** @} */ @@ -322,36 +472,110 @@ inline cublasStatus_t cublasgemmBatched( // NOLINT */ template cublasStatus_t cublasgemmStridedBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const T *alpha, const T *const Aarray, int lda, - int64_t strideA, const T *const Barray, int ldb, int64_t strideB, - const T *beta, T *Carray, int ldc, int64_t strideC, int batchCount, + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const T* alpha, + const T* const Aarray, + int lda, + int64_t strideA, + const T* const Barray, + int ldb, + int64_t strideB, + const T* beta, + T* Carray, + int ldc, + int64_t strideC, + int batchCount, cudaStream_t stream); template <> inline cublasStatus_t cublasgemmStridedBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const float *alpha, const float *const Aarray, int lda, - int64_t strideA, const float *const Barray, int ldb, int64_t strideB, - const float *beta, float *Carray, int ldc, int64_t strideC, int batchCount, - cudaStream_t stream) { + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, + const float* const Aarray, + int lda, + int64_t strideA, + const float* const Barray, + int ldb, + int64_t strideB, + const float* beta, + float* Carray, + int ldc, + int64_t strideC, + int batchCount, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha, - Aarray, lda, strideA, Barray, ldb, strideB, - beta, Carray, ldc, strideC, batchCount); + return cublasSgemmStridedBatched(handle, + transa, + transb, + m, + n, + k, + alpha, + Aarray, + lda, + strideA, + Barray, + ldb, + strideB, + beta, + Carray, + ldc, + strideC, + batchCount); } template <> inline cublasStatus_t cublasgemmStridedBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const double *alpha, const double *const Aarray, int lda, - int64_t strideA, const double *const Barray, int ldb, int64_t strideB, - const double *beta, double *Carray, int ldc, int64_t strideC, int batchCount, - cudaStream_t stream) { + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double* alpha, + const double* const Aarray, + int lda, + int64_t strideA, + const double* const Barray, + int ldb, + int64_t strideB, + const double* beta, + double* Carray, + int ldc, + int64_t strideC, + int batchCount, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemmStridedBatched(handle, transa, transb, m, n, k, alpha, - Aarray, lda, strideA, Barray, ldb, strideB, - beta, Carray, ldc, strideC, batchCount); + return cublasDgemmStridedBatched(handle, + transa, + transb, + m, + n, + k, + alpha, + Aarray, + lda, + strideA, + Barray, + ldb, + strideB, + beta, + Carray, + ldc, + strideC, + batchCount); } /** @} */ @@ -361,51 +585,85 @@ inline cublasStatus_t cublasgemmStridedBatched( // NOLINT */ template -cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, int n, // NOLINT - T *const A[], // NOLINT - int lda, int *P, int *info, int batchSize, +cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, + int n, // NOLINT + T* const A[], // NOLINT + int lda, + int* P, + int* info, + int batchSize, cudaStream_t stream); template <> -inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT - int n, float *const A[], // NOLINT - int lda, int *P, int *info, - int batchSize, cudaStream_t stream) { +inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT + int n, + float* const A[], // NOLINT + int lda, + int* P, + int* info, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize); } template <> -inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT - int n, double *const A[], // NOLINT - int lda, int *P, int *info, - int batchSize, cudaStream_t stream) { +inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT + int n, + double* const A[], // NOLINT + int lda, + int* P, + int* info, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize); } template -cublasStatus_t cublasgetriBatched(cublasHandle_t handle, int n, // NOLINT - const T *const A[], // NOLINT - int lda, const int *P, - T *const C[], // NOLINT - int ldc, int *info, int batchSize, +cublasStatus_t cublasgetriBatched(cublasHandle_t handle, + int n, // NOLINT + const T* const A[], // NOLINT + int lda, + const int* P, + T* const C[], // NOLINT + int ldc, + int* info, + int batchSize, cudaStream_t stream); template <> -inline cublasStatus_t cublasgetriBatched( // NOLINT - cublasHandle_t handle, int n, const float *const A[], // NOLINT - int lda, const int *P, float *const C[], // NOLINT - int ldc, int *info, int batchSize, cudaStream_t stream) { +inline cublasStatus_t cublasgetriBatched( // NOLINT + cublasHandle_t handle, + int n, + const float* const A[], // NOLINT + int lda, + const int* P, + float* const C[], // NOLINT + int ldc, + int* info, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); } template <> -inline cublasStatus_t cublasgetriBatched( // NOLINT - cublasHandle_t handle, int n, const double *const A[], // NOLINT - int lda, const int *P, double *const C[], // NOLINT - int ldc, int *info, int batchSize, cudaStream_t stream) { +inline cublasStatus_t cublasgetriBatched( // NOLINT + cublasHandle_t handle, + int n, + const double* const A[], // NOLINT + int lda, + const int* P, + double* const C[], // NOLINT + int ldc, + int* info, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); } @@ -419,34 +677,57 @@ inline cublasStatus_t cublasgetriBatched( // NOLINT template inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT - cublasOperation_t trans, int m, int n, - int nrhs, T *Aarray[], // NOLINT - int lda, T *Carray[], // NOLINT - int ldc, int *info, int *devInfoArray, - int batchSize, cudaStream_t stream); + cublasOperation_t trans, + int m, + int n, + int nrhs, + T* Aarray[], // NOLINT + int lda, + T* Carray[], // NOLINT + int ldc, + int* info, + int* devInfoArray, + int batchSize, + cudaStream_t stream); template <> inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT - cublasOperation_t trans, int m, int n, - int nrhs, float *Aarray[], // NOLINT - int lda, float *Carray[], // NOLINT - int ldc, int *info, int *devInfoArray, - int batchSize, cudaStream_t stream) { + cublasOperation_t trans, + int m, + int n, + int nrhs, + float* Aarray[], // NOLINT + int lda, + float* Carray[], // NOLINT + int ldc, + int* info, + int* devInfoArray, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, - info, devInfoArray, batchSize); + return cublasSgelsBatched( + handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); } template <> inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT - cublasOperation_t trans, int m, int n, - int nrhs, double *Aarray[], // NOLINT - int lda, double *Carray[], // NOLINT - int ldc, int *info, int *devInfoArray, - int batchSize, cudaStream_t stream) { + cublasOperation_t trans, + int m, + int n, + int nrhs, + double* Aarray[], // NOLINT + int lda, + double* Carray[], // NOLINT + int ldc, + int* info, + int* devInfoArray, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, - info, devInfoArray, batchSize); + return cublasDgelsBatched( + handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); } /** @} */ @@ -456,33 +737,59 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT * @{ */ template -cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, const T *alfa, - const T *A, int lda, const T *beta, const T *B, - int ldb, T *C, int ldc, cudaStream_t stream); +cublasStatus_t cublasgeam(cublasHandle_t handle, + cublasOperation_t transA, + cublasOperation_t transB, + int m, + int n, + const T* alfa, + const T* A, + int lda, + const T* beta, + const T* B, + int ldb, + T* C, + int ldc, + cudaStream_t stream); template <> inline cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, - const float *alfa, const float *A, int lda, - const float *beta, const float *B, int ldb, - float *C, int ldc, cudaStream_t stream) { + cublasOperation_t transB, + int m, + int n, + const float* alfa, + const float* A, + int lda, + const float* beta, + const float* B, + int ldb, + float* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, - C, ldc); + return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc); } template <> inline cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, - const double *alfa, const double *A, int lda, - const double *beta, const double *B, int ldb, - double *C, int ldc, cudaStream_t stream) { + cublasOperation_t transB, + int m, + int n, + const double* alfa, + const double* A, + int lda, + const double* beta, + const double* B, + int ldb, + double* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, - C, ldc); + return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc); } /** @} */ @@ -491,31 +798,59 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, int m, int n, const T *alpha, - const T *A, int lda, const T *B, int ldb, - const T *beta, T *C, int ldc, cudaStream_t stream); +cublasStatus_t cublassymm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const T* alpha, + const T* A, + int lda, + const T* B, + int ldb, + const T* beta, + T* C, + int ldc, + cudaStream_t stream); template <> -inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, int m, int n, - const float *alpha, const float *A, int lda, - const float *B, int ldb, const float *beta, - float *C, int ldc, cudaStream_t stream) { +inline cublasStatus_t cublassymm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const float* alpha, + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, + float* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, - ldc); + return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } template <> -inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, int m, int n, - const double *alpha, const double *A, int lda, - const double *B, int ldb, const double *beta, - double *C, int ldc, cudaStream_t stream) { +inline cublasStatus_t cublassymm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const double* alpha, + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, + double* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, - ldc); + return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } /** @} */ @@ -524,27 +859,51 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, * @{ */ template -cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, - cublasOperation_t trans, int n, int k, const T *alpha, - const T *A, int lda, const T *beta, T *C, int ldc, +cublasStatus_t cublassyrk(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const T* alpha, + const T* A, + int lda, + const T* beta, + T* C, + int ldc, cudaStream_t stream); template <> -inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, - cublasOperation_t trans, int n, int k, - const float *alpha, const float *A, int lda, - const float *beta, float *C, int ldc, - cudaStream_t stream) { +inline cublasStatus_t cublassyrk(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float* alpha, + const float* A, + int lda, + const float* beta, + float* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } template <> -inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, - cublasOperation_t trans, int n, int k, - const double *alpha, const double *A, int lda, - const double *beta, double *C, int ldc, - cudaStream_t stream) { +inline cublasStatus_t cublassyrk(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double* alpha, + const double* A, + int lda, + const double* beta, + double* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } @@ -555,52 +914,77 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, * @{ */ template -cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const T *x, int incx, - T *result, cudaStream_t stream); +cublasStatus_t cublasnrm2( + cublasHandle_t handle, int n, const T* x, int incx, T* result, cudaStream_t stream); template <> -inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const float *x, - int incx, float *result, cudaStream_t stream) { +inline cublasStatus_t cublasnrm2( + cublasHandle_t handle, int n, const float* x, int incx, float* result, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSnrm2(handle, n, x, incx, result); } template <> -inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const double *x, - int incx, double *result, - cudaStream_t stream) { +inline cublasStatus_t cublasnrm2( + cublasHandle_t handle, int n, const double* x, int incx, double* result, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDnrm2(handle, n, x, incx, result); } /** @} */ template -cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, cublasOperation_t trans, - cublasDiagType_t diag, int m, int n, const T *alpha, - const T *A, int lda, T *B, int ldb, +cublasStatus_t cublastrsm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const T* alpha, + const T* A, + int lda, + T* B, + int ldb, cudaStream_t stream); template <> -inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, cublasOperation_t trans, - cublasDiagType_t diag, int m, int n, - const float *alpha, const float *A, int lda, - float *B, int ldb, cudaStream_t stream) { +inline cublasStatus_t cublastrsm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const float* alpha, + const float* A, + int lda, + float* B, + int ldb, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, - ldb); + return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } template <> -inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, cublasOperation_t trans, - cublasDiagType_t diag, int m, int n, - const double *alpha, const double *A, int lda, - double *B, int ldb, cudaStream_t stream) { +inline cublasStatus_t cublastrsm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const double* alpha, + const double* A, + int lda, + double* B, + int ldb, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, - ldb); + return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } /** @@ -608,21 +992,39 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, * @{ */ template -cublasStatus_t cublasdot(cublasHandle_t handle, int n, const T *x, int incx, - const T *y, int incy, T *result, cudaStream_t stream); +cublasStatus_t cublasdot(cublasHandle_t handle, + int n, + const T* x, + int incx, + const T* y, + int incy, + T* result, + cudaStream_t stream); template <> -inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const float *x, - int incx, const float *y, int incy, - float *result, cudaStream_t stream) { +inline cublasStatus_t cublasdot(cublasHandle_t handle, + int n, + const float* x, + int incx, + const float* y, + int incy, + float* result, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSdot(handle, n, x, incx, y, incy, result); } template <> -inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x, - int incx, const double *y, int incy, - double *result, cudaStream_t stream) { +inline cublasStatus_t cublasdot(cublasHandle_t handle, + int n, + const double* x, + int incx, + const double* y, + int incy, + double* result, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDdot(handle, n, x, incx, y, incy, result); } @@ -642,7 +1044,8 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x, // template<> inline cublasStatus_t cublassetpointermode(cublasHandle_t handle, cublasPointerMode_t mode, - cudaStream_t stream) { + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSetPointerMode(handle, mode); } @@ -653,21 +1056,21 @@ inline cublasStatus_t cublassetpointermode(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublasscal(cublasHandle_t handle, int n, const T *alpha, T *x, - int incx, cudaStream_t stream); +cublasStatus_t cublasscal( + cublasHandle_t handle, int n, const T* alpha, T* x, int incx, cudaStream_t stream); template <> -inline cublasStatus_t cublasscal(cublasHandle_t handle, int n, - const float *alpha, float *x, int incx, - cudaStream_t stream) { +inline cublasStatus_t cublasscal( + cublasHandle_t handle, int n, const float* alpha, float* x, int incx, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSscal(handle, n, alpha, x, incx); } template <> -inline cublasStatus_t cublasscal(cublasHandle_t handle, int n, - const double *alpha, double *x, int incx, - cudaStream_t stream) { +inline cublasStatus_t cublasscal( + cublasHandle_t handle, int n, const double* alpha, double* x, int incx, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDscal(handle, n, alpha, x, incx); } diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/cusolver_wrappers.h index 0eadf47fe3..76a9f40f4d 100644 --- a/cpp/include/raft/linalg/cusolver_wrappers.h +++ b/cpp/include/raft/linalg/cusolver_wrappers.h @@ -24,8 +24,7 @@ #include #define _CUSOLVER_ERR_TO_STR(err) \ - case err: \ - return #err; + case err: return #err; namespace raft { @@ -33,16 +32,15 @@ namespace raft { * @brief Exception thrown when a cuSOLVER error is encountered. */ struct cusolver_error : public raft::exception { - explicit cusolver_error(char const *const message) - : raft::exception(message) {} - explicit cusolver_error(std::string const &message) - : raft::exception(message) {} + explicit cusolver_error(char const* const message) : raft::exception(message) {} + explicit cusolver_error(std::string const& message) : raft::exception(message) {} }; namespace linalg { namespace detail { -inline const char *cusolver_error_to_string(cusolverStatus_t err) { +inline const char* cusolver_error_to_string(cusolverStatus_t err) +{ switch (err) { _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); @@ -54,8 +52,7 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) { _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); - default: - return "CUSOLVER_STATUS_UNKNOWN"; + default: return "CUSOLVER_STATUS_UNKNOWN"; }; } @@ -76,8 +73,11 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) { cusolverStatus_t const status = (call); \ if (CUSOLVER_STATUS_SUCCESS != status) { \ std::string msg{}; \ - SET_ERROR_MSG(msg, "cuSOLVER error encountered at: ", \ - "call='%s', Reason=%d:%s", #call, status, \ + SET_ERROR_MSG(msg, \ + "cuSOLVER error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ raft::linalg::detail::cusolver_error_to_string(status)); \ throw raft::cusolver_error(msg); \ } \ @@ -107,42 +107,76 @@ namespace linalg { * @{ */ template -cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, int m, // NOLINT - int n, T *A, int lda, T *Workspace, - int *devIpiv, int *devInfo, +cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, + int m, // NOLINT + int n, + T* A, + int lda, + T* Workspace, + int* devIpiv, + int* devInfo, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT - int m, int n, float *A, int lda, - float *Workspace, int *devIpiv, - int *devInfo, cudaStream_t stream) { + int m, + int n, + float* A, + int lda, + float* Workspace, + int* devIpiv, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); } template <> inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT - int m, int n, double *A, int lda, - double *Workspace, int *devIpiv, - int *devInfo, cudaStream_t stream) { + int m, + int n, + double* A, + int lda, + double* Workspace, + int* devIpiv, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); } template cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork); + cusolverDnHandle_t handle, + int m, + int n, + T* A, + int lda, + int* Lwork); template <> inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) { + cusolverDnHandle_t handle, + int m, + int n, + float* A, + int lda, + int* Lwork) +{ return cusolverDnSgetrf_bufferSize(handle, m, n, A, lda, Lwork); } template <> inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) { + cusolverDnHandle_t handle, + int m, + int n, + double* A, + int lda, + int* Lwork) +{ return cusolverDnDgetrf_bufferSize(handle, m, n, A, lda, Lwork); } @@ -152,30 +186,49 @@ inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT - cublasOperation_t trans, int n, int nrhs, - const T *A, int lda, const int *devIpiv, T *B, - int ldb, int *devInfo, cudaStream_t stream); + cublasOperation_t trans, + int n, + int nrhs, + const T* A, + int lda, + const int* devIpiv, + T* B, + int ldb, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT - cublasOperation_t trans, int n, - int nrhs, const float *A, int lda, - const int *devIpiv, float *B, int ldb, - int *devInfo, cudaStream_t stream) { + cublasOperation_t trans, + int n, + int nrhs, + const float* A, + int lda, + const int* devIpiv, + float* B, + int ldb, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, - devInfo); + return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); } template <> inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT - cublasOperation_t trans, int n, - int nrhs, const double *A, int lda, - const int *devIpiv, double *B, int ldb, - int *devInfo, cudaStream_t stream) { + cublasOperation_t trans, + int n, + int nrhs, + const double* A, + int lda, + const int* devIpiv, + double* B, + int ldb, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, - devInfo); + return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); } /** @} */ @@ -185,20 +238,40 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT */ template cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const T *A, int lda, const T *W, int *lwork); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const T* A, + int lda, + const T* W, + int* lwork); template <> inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const float *A, int lda, const float *W, int *lwork) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const float* A, + int lda, + const float* W, + int* lwork) +{ return cusolverDnSsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork); } template <> inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const double *A, int lda, const double *W, int *lwork) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const double* A, + int lda, + const double* W, + int* lwork) +{ return cusolverDnDsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork); } /** @} */ @@ -209,52 +282,96 @@ inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnsyevj(cusolverDnHandle_t handle, // NOLINT - cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, T *A, int lda, T *W, T *work, int lwork, - int *info, syevjInfo_t params, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + T* A, + int lda, + T* W, + T* work, + int lwork, + int* info, + syevjInfo_t params, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnsyevj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, float *A, int lda, float *W, float *work, int lwork, int *info, - syevjInfo_t params, cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + float* A, + int lda, + float* W, + float* work, + int lwork, + int* info, + syevjInfo_t params, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, - params); + return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); } template <> inline cusolverStatus_t cusolverDnsyevj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, double *A, int lda, double *W, double *work, int lwork, int *info, - syevjInfo_t params, cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + double* A, + int lda, + double* W, + double* work, + int lwork, + int* info, + syevjInfo_t params, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, - params); + return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); } template cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const T *A, int lda, const T *W, int *lwork, syevjInfo_t params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const T* A, + int lda, + const T* W, + int* lwork, + syevjInfo_t params); template <> inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const float *A, int lda, const float *W, int *lwork, - syevjInfo_t params) { - return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, - params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const float* A, + int lda, + const float* W, + int* lwork, + syevjInfo_t params) +{ + return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params); } template <> inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const double *A, int lda, const double *W, int *lwork, - syevjInfo_t params) { - return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, - params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const double* A, + int lda, + const double* W, + int* lwork, + syevjInfo_t params) +{ + return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params); } /** @} */ @@ -264,32 +381,49 @@ inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT - cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, T *A, int lda, T *W, T *work, int lwork, - int *devInfo, cudaStream_t stream); + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + T* A, + int lda, + T* W, + T* work, + int lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT cusolverEigMode_t jobz, - cublasFillMode_t uplo, int n, float *A, - int lda, float *W, float *work, - int lwork, int *devInfo, - cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + float* A, + int lda, + float* W, + float* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, - devInfo); + return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT cusolverEigMode_t jobz, - cublasFillMode_t uplo, int n, double *A, - int lda, double *W, double *work, - int lwork, int *devInfo, - cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + double* A, + int lda, + double* W, + double* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, - devInfo); + return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo); } /** @} */ @@ -297,57 +431,134 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT /** * @defgroup syevdx cusolver syevdx operations * @{ -*/ + */ template cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, const T *A, int lda, T vl, T vu, int il, int iu, - int *h_meig, const T *W, int *lwork); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const T* A, + int lda, + T vl, + T vu, + int il, + int iu, + int* h_meig, + const T* W, + int* lwork); template <> inline cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, const float *A, int lda, float vl, float vu, - int il, int iu, int *h_meig, const float *W, int *lwork) { - return cusolverDnSsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl, - vu, il, iu, h_meig, W, lwork); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const float* A, + int lda, + float vl, + float vu, + int il, + int iu, + int* h_meig, + const float* W, + int* lwork) +{ + return cusolverDnSsyevdx_bufferSize( + handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork); } template <> inline cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, const double *A, int lda, double vl, double vu, - int il, int iu, int *h_meig, const double *W, int *lwork) { - return cusolverDnDsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl, - vu, il, iu, h_meig, W, lwork); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const double* A, + int lda, + double vl, + double vu, + int il, + int iu, + int* h_meig, + const double* W, + int* lwork) +{ + return cusolverDnDsyevdx_bufferSize( + handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork); } template cusolverStatus_t cusolverDnsyevdx( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, T *A, int lda, T vl, T vu, int il, int iu, - int *h_meig, T *W, T *work, int lwork, int *devInfo, cudaStream_t stream); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + T* A, + int lda, + T vl, + T vu, + int il, + int iu, + int* h_meig, + T* W, + T* work, + int lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnsyevdx( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, float *A, int lda, float vl, float vu, int il, - int iu, int *h_meig, float *W, float *work, int lwork, int *devInfo, - cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + float* A, + int lda, + float vl, + float vu, + int il, + int iu, + int* h_meig, + float* W, + float* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, - h_meig, W, work, lwork, devInfo); + return cusolverDnSsyevdx( + handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnsyevdx( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, double *A, int lda, double vl, double vu, - int il, int iu, int *h_meig, double *W, double *work, int lwork, int *devInfo, - cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + double* A, + int lda, + double vl, + double vu, + int il, + int iu, + int* h_meig, + double* W, + double* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, - h_meig, W, work, lwork, devInfo); + return cusolverDnDsyevdx( + handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); } /** @} */ #endif @@ -358,7 +569,11 @@ inline cusolverStatus_t cusolverDnsyevdx( // NOLINT */ template cusolverStatus_t cusolverDngesvd_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, int *lwork) { + cusolverDnHandle_t handle, + int m, + int n, + int* lwork) +{ if (std::is_same, float>::value) { return cusolverDnSgesvd_bufferSize(handle, m, n, lwork); } else { @@ -367,72 +582,194 @@ cusolverStatus_t cusolverDngesvd_bufferSize( // NOLINT } template cusolverStatus_t cusolverDngesvd( // NOLINT - cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, - T *A, int lda, T *S, T *U, int ldu, T *VT, int ldvt, T *work, int lwork, - T *rwork, int *devInfo, cudaStream_t stream); + cusolverDnHandle_t handle, + signed char jobu, + signed char jobvt, + int m, + int n, + T* A, + int lda, + T* S, + T* U, + int ldu, + T* VT, + int ldvt, + T* work, + int lwork, + T* rwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngesvd( // NOLINT - cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, - float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt, - float *work, int lwork, float *rwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + signed char jobu, + signed char jobvt, + int m, + int n, + float* A, + int lda, + float* S, + float* U, + int ldu, + float* VT, + int ldvt, + float* work, + int lwork, + float* rwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, - ldvt, work, lwork, rwork, devInfo); + return cusolverDnSgesvd( + handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo); } template <> inline cusolverStatus_t cusolverDngesvd( // NOLINT - cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, - double *A, int lda, double *S, double *U, int ldu, double *VT, int ldvt, - double *work, int lwork, double *rwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + signed char jobu, + signed char jobvt, + int m, + int n, + double* A, + int lda, + double* S, + double* U, + int ldu, + double* VT, + int ldvt, + double* work, + int lwork, + double* rwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, - ldvt, work, lwork, rwork, devInfo); + return cusolverDnDgesvd( + handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo); } template inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - const T *A, int lda, const T *S, const T *U, int ldu, const T *V, int ldv, - int *lwork, gesvdjInfo_t params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + const T* A, + int lda, + const T* S, + const T* U, + int ldu, + const T* V, + int ldv, + int* lwork, + gesvdjInfo_t params); template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - const float *A, int lda, const float *S, const float *U, int ldu, - const float *V, int ldv, int *lwork, gesvdjInfo_t params) { - return cusolverDnSgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U, - ldu, V, ldv, lwork, params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + const float* A, + int lda, + const float* S, + const float* U, + int ldu, + const float* V, + int ldv, + int* lwork, + gesvdjInfo_t params) +{ + return cusolverDnSgesvdj_bufferSize( + handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params); } template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - const double *A, int lda, const double *S, const double *U, int ldu, - const double *V, int ldv, int *lwork, gesvdjInfo_t params) { - return cusolverDnDgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U, - ldu, V, ldv, lwork, params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + const double* A, + int lda, + const double* S, + const double* U, + int ldu, + const double* V, + int ldv, + int* lwork, + gesvdjInfo_t params) +{ + return cusolverDnDgesvdj_bufferSize( + handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params); } template inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - T *A, int lda, T *S, T *U, int ldu, T *V, int ldv, T *work, int lwork, - int *info, gesvdjInfo_t params, cudaStream_t stream); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + T* A, + int lda, + T* S, + T* U, + int ldu, + T* V, + int ldv, + T* work, + int lwork, + int* info, + gesvdjInfo_t params, + cudaStream_t stream); template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - float *A, int lda, float *S, float *U, int ldu, float *V, int ldv, - float *work, int lwork, int *info, gesvdjInfo_t params, cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + float* A, + int lda, + float* S, + float* U, + int ldu, + float* V, + int ldv, + float* work, + int lwork, + int* info, + gesvdjInfo_t params, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, - work, lwork, info, params); + return cusolverDnSgesvdj( + handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); } template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - double *A, int lda, double *S, double *U, int ldu, double *V, int ldv, - double *work, int lwork, int *info, gesvdjInfo_t params, - cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + double* A, + int lda, + double* S, + double* U, + int ldu, + double* V, + int ldv, + double* work, + int lwork, + int* info, + gesvdjInfo_t params, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, - work, lwork, info, params); + return cusolverDnDgesvdj( + handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); } /** @} */ @@ -442,43 +779,74 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT */ template cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, T *A, int lda, - int *Lwork); + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + T* A, + int lda, + int* Lwork); template <> inline cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, float *A, int lda, - int *Lwork) { + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + float* A, + int lda, + int* Lwork) +{ return cusolverDnSpotrf_bufferSize(handle, uplo, n, A, lda, Lwork); } template <> inline cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda, - int *Lwork) { + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + double* A, + int lda, + int* Lwork) +{ return cusolverDnDpotrf_bufferSize(handle, uplo, n, A, lda, Lwork); } template inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, T *A, - int lda, T *Workspace, int Lwork, - int *devInfo, cudaStream_t stream); + cublasFillMode_t uplo, + int n, + T* A, + int lda, + T* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, float *A, - int lda, float *Workspace, int Lwork, - int *devInfo, cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + float* A, + int lda, + float* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, double *A, - int lda, double *Workspace, int Lwork, - int *devInfo, cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + double* A, + int lda, + double* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); } @@ -490,26 +858,44 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT */ template cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, int nrhs, - const T *A, int lda, T *B, int ldb, - int *devInfo, cudaStream_t stream); + cublasFillMode_t uplo, + int n, + int nrhs, + const T* A, + int lda, + T* B, + int ldb, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, int nrhs, - const float *A, int lda, float *B, - int ldb, int *devInfo, - cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + int nrhs, + const float* A, + int lda, + float* B, + int ldb, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); } template <> inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, int nrhs, - const double *A, int lda, double *B, - int ldb, int *devInfo, - cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + int nrhs, + const double* A, + int lda, + double* B, + int ldb, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); } @@ -520,38 +906,75 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT * @{ */ template -cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, int m, // NOLINT - int n, T *A, int lda, T *TAU, T *Workspace, - int Lwork, int *devInfo, cudaStream_t stream); +cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, + int m, // NOLINT + int n, + T* A, + int lda, + T* TAU, + T* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT - int m, int n, float *A, int lda, - float *TAU, float *Workspace, int Lwork, - int *devInfo, cudaStream_t stream) { + int m, + int n, + float* A, + int lda, + float* TAU, + float* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); } template <> inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT - int m, int n, double *A, int lda, - double *TAU, double *Workspace, - int Lwork, int *devInfo, - cudaStream_t stream) { + int m, + int n, + double* A, + int lda, + double* TAU, + double* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); } template cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork); + cusolverDnHandle_t handle, + int m, + int n, + T* A, + int lda, + int* Lwork); template <> inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) { + cusolverDnHandle_t handle, + int m, + int n, + float* A, + int lda, + int* Lwork) +{ return cusolverDnSgeqrf_bufferSize(handle, m, n, A, lda, Lwork); } template <> inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) { + cusolverDnHandle_t handle, + int m, + int n, + double* A, + int lda, + int* Lwork) +{ return cusolverDnDgeqrf_bufferSize(handle, m, n, A, lda, Lwork); } /** @} */ @@ -562,38 +985,86 @@ inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnorgqr( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, T *A, int lda, const T *tau, - T *work, int lwork, int *devInfo, cudaStream_t stream); + cusolverDnHandle_t handle, + int m, + int n, + int k, + T* A, + int lda, + const T* tau, + T* work, + int lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnorgqr( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, float *A, int lda, - const float *tau, float *work, int lwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + int m, + int n, + int k, + float* A, + int lda, + const float* tau, + float* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnorgqr( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, double *A, int lda, - const double *tau, double *work, int lwork, int *devInfo, - cudaStream_t stream) { + cusolverDnHandle_t handle, + int m, + int n, + int k, + double* A, + int lda, + const double* tau, + double* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); } template cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, const T *A, int lda, - const T *TAU, int *lwork); + cusolverDnHandle_t handle, + int m, + int n, + int k, + const T* A, + int lda, + const T* TAU, + int* lwork); template <> inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, const float *A, int lda, - const float *TAU, int *lwork) { + cusolverDnHandle_t handle, + int m, + int n, + int k, + const float* A, + int lda, + const float* TAU, + int* lwork) +{ return cusolverDnSorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork); } template <> inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, const double *A, int lda, - const double *TAU, int *lwork) { + cusolverDnHandle_t handle, + int m, + int n, + int k, + const double* A, + int lda, + const double* TAU, + int* lwork) +{ return cusolverDnDorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork); } /** @} */ @@ -604,53 +1075,114 @@ inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnormqr(cusolverDnHandle_t handle, // NOLINT - cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const T *A, int lda, - const T *tau, T *C, int ldc, T *work, - int lwork, int *devInfo, cudaStream_t stream); + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const T* A, + int lda, + const T* tau, + T* C, + int ldc, + T* work, + int lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnormqr( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const float *A, int lda, const float *tau, float *C, - int ldc, float *work, int lwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const float* A, + int lda, + const float* tau, + float* C, + int ldc, + float* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, - work, lwork, devInfo); + return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnormqr( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const double *A, int lda, const double *tau, double *C, - int ldc, double *work, int lwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const double* A, + int lda, + const double* tau, + double* C, + int ldc, + double* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, - work, lwork, devInfo); + return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); } template cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const T *A, int lda, const T *tau, const T *C, int ldc, - int *lwork); + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const T* A, + int lda, + const T* tau, + const T* C, + int ldc, + int* lwork); template <> inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const float *A, int lda, const float *tau, - const float *C, int ldc, int *lwork) { - return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, - C, ldc, lwork); + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const float* A, + int lda, + const float* tau, + const float* C, + int ldc, + int* lwork) +{ + return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); } template <> inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const double *A, int lda, const double *tau, - const double *C, int ldc, int *lwork) { - return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, - C, ldc, lwork); + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const double* A, + int lda, + const double* tau, + const double* C, + int ldc, + int* lwork) +{ + return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); } /** @} */ @@ -660,62 +1192,136 @@ inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT */ template cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA, - const int *csrColIndA, int batchSize, csrqrInfo_t info, - size_t *internalDataInBytes, size_t *workspaceInBytes); + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + int batchSize, + csrqrInfo_t info, + size_t* internalDataInBytes, + size_t* workspaceInBytes); template <> inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA, - const int *csrColIndA, int batchSize, csrqrInfo_t info, - size_t *internalDataInBytes, size_t *workspaceInBytes) { - return cusolverSpScsrqrBufferInfoBatched( - handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize, - info, internalDataInBytes, workspaceInBytes); + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + int batchSize, + csrqrInfo_t info, + size_t* internalDataInBytes, + size_t* workspaceInBytes) +{ + return cusolverSpScsrqrBufferInfoBatched(handle, + m, + n, + nnzA, + descrA, + csrValA, + csrRowPtrA, + csrColIndA, + batchSize, + info, + internalDataInBytes, + workspaceInBytes); } template <> inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA, - const int *csrColIndA, int batchSize, csrqrInfo_t info, - size_t *internalDataInBytes, size_t *workspaceInBytes) { - return cusolverSpDcsrqrBufferInfoBatched( - handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize, - info, internalDataInBytes, workspaceInBytes); + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + int batchSize, + csrqrInfo_t info, + size_t* internalDataInBytes, + size_t* workspaceInBytes) +{ + return cusolverSpDcsrqrBufferInfoBatched(handle, + m, + n, + nnzA, + descrA, + csrValA, + csrRowPtrA, + csrColIndA, + batchSize, + info, + internalDataInBytes, + workspaceInBytes); } template cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA, - const int *csrColIndA, const T *b, T *x, int batchSize, csrqrInfo_t info, - void *pBuffer, cudaStream_t stream); + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const T* b, + T* x, + int batchSize, + csrqrInfo_t info, + void* pBuffer, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA, - const int *csrColIndA, const float *b, float *x, int batchSize, - csrqrInfo_t info, void *pBuffer, cudaStream_t stream) { + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const float* b, + float* x, + int batchSize, + csrqrInfo_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); - return cusolverSpScsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA, - csrRowPtrA, csrColIndA, b, x, batchSize, - info, pBuffer); + return cusolverSpScsrqrsvBatched( + handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); } template <> inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA, - const int *csrColIndA, const double *b, double *x, int batchSize, - csrqrInfo_t info, void *pBuffer, cudaStream_t stream) { + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const double* b, + double* x, + int batchSize, + csrqrInfo_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); - return cusolverSpDcsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA, - csrRowPtrA, csrColIndA, b, x, batchSize, - info, pBuffer); + return cusolverSpDcsrqrsvBatched( + handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); } /** @} */ diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh index c848ac1f4b..562a3d8991 100644 --- a/cpp/include/raft/linalg/divide.cuh +++ b/cpp/include/raft/linalg/divide.cuh @@ -33,11 +33,10 @@ namespace linalg { * @{ */ template -void divideScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, - cudaStream_t stream) { +void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +{ unaryOp( - out, in, len, [scalar] __device__(math_t in) { return in / scalar; }, - stream); + out, in, len, [scalar] __device__(math_t in) { return in / scalar; }, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh index 6172618380..75e77ac0ce 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/eig.cuh @@ -41,26 +41,43 @@ namespace linalg { * @{ */ template -void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, - int n_cols, math_t *eig_vectors, math_t *eig_vals, - cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); +void eigDC(const raft::handle_t& handle, + const math_t* in, + int n_rows, + int n_cols, + math_t* eig_vectors, + math_t* eig_vals, + cudaStream_t stream) +{ + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int lwork; - CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, n_rows, in, - n_cols, eig_vals, &lwork)); + CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + in, + n_cols, + eig_vals, + &lwork)); raft::mr::device::buffer d_work(allocator, stream, lwork); raft::mr::device::buffer d_dev_info(allocator, stream, 1); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, - n_cols, eig_vals, d_work.data(), lwork, - d_dev_info.data(), stream)); + CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + eig_vals, + d_work.data(), + lwork, + d_dev_info.data(), + stream)); CUDA_CHECK(cudaGetLastError()); int dev_info; @@ -90,39 +107,80 @@ enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT }; * @{ */ template -void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, - int n_eig_vals, math_t *eig_vectors, math_t *eig_vals, - EigVecMemUsage memUsage, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); +void eigSelDC(const raft::handle_t& handle, + math_t* in, + int n_rows, + int n_cols, + int n_eig_vals, + math_t* eig_vectors, + math_t* eig_vals, + EigVecMemUsage memUsage, + cudaStream_t stream) +{ + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int lwork; int h_meig; - CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), - n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork)); + CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, + n_rows, + in, + n_cols, + math_t(0.0), + math_t(0.0), + n_cols - n_eig_vals + 1, + n_cols, + &h_meig, + eig_vals, + &lwork)); raft::mr::device::buffer d_work(allocator, stream, lwork); raft::mr::device::buffer d_dev_info(allocator, stream, 1); raft::mr::device::buffer d_eig_vectors(allocator, stream, 0); if (memUsage == OVERWRITE_INPUT) { - CUSOLVER_CHECK(cusolverDnsyevdx( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), - n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, d_work.data(), lwork, - d_dev_info.data(), stream)); + CUSOLVER_CHECK(cusolverDnsyevdx(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, + n_rows, + in, + n_cols, + math_t(0.0), + math_t(0.0), + n_cols - n_eig_vals + 1, + n_cols, + &h_meig, + eig_vals, + d_work.data(), + lwork, + d_dev_info.data(), + stream)); } else if (memUsage == COPY_INPUT) { d_eig_vectors.resize(n_rows * n_cols, stream); raft::matrix::copy(in, d_eig_vectors.data(), n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnsyevdx( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, n_cols, math_t(0.0), - math_t(0.0), n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, - d_work.data(), lwork, d_dev_info.data(), stream)); + CUSOLVER_CHECK(cusolverDnsyevdx(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + math_t(0.0), + math_t(0.0), + n_cols - n_eig_vals + 1, + n_cols, + &h_meig, + eig_vals, + d_work.data(), + lwork, + d_dev_info.data(), + stream)); } CUDA_CHECK(cudaGetLastError()); @@ -135,11 +193,10 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, "This usually occurs when some of the features do not vary enough."); if (memUsage == OVERWRITE_INPUT) { - raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals, - stream); + raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals, stream); } else if (memUsage == COPY_INPUT) { - raft::matrix::truncZeroOrigin(d_eig_vectors.data(), n_rows, eig_vectors, - n_rows, n_eig_vals, stream); + raft::matrix::truncZeroOrigin( + d_eig_vectors.data(), n_rows, eig_vectors, n_rows, n_eig_vals, stream); } } @@ -160,10 +217,17 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, * @{ */ template -void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, - int n_cols, math_t *eig_vectors, math_t *eig_vals, - cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) { - auto allocator = handle.get_device_allocator(); +void eigJacobi(const raft::handle_t& handle, + const math_t* in, + int n_rows, + int n_cols, + math_t* eig_vectors, + math_t* eig_vals, + cudaStream_t stream, + math_t tol = 1.e-7, + int sweeps = 15) +{ + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); syevjInfo_t syevj_params = nullptr; @@ -172,23 +236,36 @@ void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(syevj_params, sweeps)); int lwork; - CUSOLVER_CHECK(cusolverDnsyevj_bufferSize( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows, - eig_vectors, n_cols, eig_vals, &lwork, syevj_params)); + CUSOLVER_CHECK(cusolverDnsyevj_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + eig_vals, + &lwork, + syevj_params)); raft::mr::device::buffer d_work(allocator, stream, lwork); raft::mr::device::buffer dev_info(allocator, stream, 1); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, - n_cols, eig_vals, d_work.data(), lwork, - dev_info.data(), syevj_params, stream)); + CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + eig_vals, + d_work.data(), + lwork, + dev_info.data(), + syevj_params, + stream)); int executed_sweeps; - CUSOLVER_CHECK( - cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps)); + CUSOLVER_CHECK(cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps)); CUDA_CHECK(cudaGetLastError()); CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params)); diff --git a/cpp/include/raft/linalg/eltwise.cuh b/cpp/include/raft/linalg/eltwise.cuh index 1c6dee562d..097c3ac218 100644 --- a/cpp/include/raft/linalg/eltwise.cuh +++ b/cpp/include/raft/linalg/eltwise.cuh @@ -34,19 +34,17 @@ namespace linalg { * @{ */ template -void scalarAdd(OutType *out, const InType *in, InType scalar, IdxType len, - cudaStream_t stream) { +void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ raft::linalg::unaryOp( - out, in, len, [scalar] __device__(InType in) { return in + scalar; }, - stream); + out, in, len, [scalar] __device__(InType in) { return in + scalar; }, stream); } template -void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len, - cudaStream_t stream) { +void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ raft::linalg::unaryOp( - out, in, len, [scalar] __device__(InType in) { return in * scalar; }, - stream); + out, in, len, [scalar] __device__(InType in) { return in * scalar; }, stream); } /** @} */ @@ -62,42 +60,46 @@ void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len, * @{ */ template -void eltwiseAdd(OutType *out, const InType *in1, const InType *in2, IdxType len, - cudaStream_t stream) { +void eltwiseAdd( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, stream); } template -void eltwiseSub(OutType *out, const InType *in1, const InType *in2, IdxType len, - cudaStream_t stream) { +void eltwiseSub( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; }, stream); } template -void eltwiseMultiply(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void eltwiseMultiply( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; }, stream); } template -void eltwiseDivide(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void eltwiseDivide( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; }, stream); } template -void eltwiseDivideCheckZero(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void eltwiseDivideCheckZero( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, + out, + in1, + in2, + len, [] __device__(InType a, InType b) { if (b == InType(0.0)) return InType(0.0); diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh index 0a4897cc0b..d5942b7446 100644 --- a/cpp/include/raft/linalg/gemm.cuh +++ b/cpp/include/raft/linalg/gemm.cuh @@ -43,35 +43,53 @@ namespace linalg { * @param stream cuda stream */ template -void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, - int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c, - cublasOperation_t trans_a, cublasOperation_t trans_b, math_t alpha, - math_t beta, cudaStream_t stream) { +void gemm(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* b, + math_t* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + math_t alpha, + math_t beta, + cudaStream_t stream) +{ cublasHandle_t cublas_h = handle.get_cublas_handle(); - int m = n_rows_c; - int n = n_cols_c; - int k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a; + int m = n_rows_c; + int n = n_cols_c; + int k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a; int lda = trans_a == CUBLAS_OP_T ? k : m; int ldb = trans_b == CUBLAS_OP_T ? n : k; int ldc = m; - CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, - b, ldb, &beta, c, ldc, stream)); + CUBLAS_CHECK( + cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); } template -void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, - int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c, - cublasOperation_t trans_a, cublasOperation_t trans_b, - cudaStream_t stream) { +void gemm(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* b, + math_t* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + cudaStream_t stream) +{ math_t alpha = math_t(1); - math_t beta = math_t(0); - gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, - trans_b, alpha, beta, stream); + math_t beta = math_t(0); + gemm( + handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); } /** - * @brief A wrapper for CUBLS GEMM function designed for handling all possible + * @brief A wrapper for CUBLS GEMM function designed for handling all possible * combinations of operand layouts. * It computes the following equation: Z = alpha . X * Y + beta . Z * @tparam T Data type of input/output matrices (float/double) @@ -90,9 +108,20 @@ void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, * @param beta scalar */ template -void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, - int _K, bool isZColMajor, bool isXColMajor, bool isYColMajor, - cudaStream_t stream, T alpha = T(1.0), T beta = T(0.0)) { +void gemm(const raft::handle_t& handle, + T* z, + T* x, + T* y, + int _M, + int _N, + int _K, + bool isZColMajor, + bool isXColMajor, + bool isYColMajor, + cudaStream_t stream, + T alpha = T(1.0), + T beta = T(0.0)) +{ cublasHandle_t cublas_h = handle.get_cublas_handle(); cublasOperation_t trans_a, trans_b; @@ -119,13 +148,13 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, // therefore trans_x needs to be CUBLAS_OP_T. If x is in column major // layout, trans_b needs to be CUBLAS_OP_N. trans_b = isYColMajor == true ? CUBLAS_OP_N : CUBLAS_OP_T; - ldb = isYColMajor == true ? _K : _N; + ldb = isYColMajor == true ? _K : _N; - c = z; + c = z; ldc = _M; - M = _M; - N = _N; - K = _K; + M = _M; + N = _N; + K = _K; } else { // Result c is required in row major layout Thus we pick // a = y, b = x and c = a * b = y * x @@ -154,7 +183,7 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, // Set leading dimension appropriately ldb = isXColMajor == true ? _M : _K; - c = z; + c = z; ldc = _N; M = _N; @@ -162,8 +191,8 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, K = _K; } // Actual cuBLAS call - CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, - b, ldb, &beta, c, ldc, stream)); + CUBLAS_CHECK( + cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); } } // end namespace linalg diff --git a/cpp/include/raft/linalg/gemv.h b/cpp/include/raft/linalg/gemv.h index edd18b3bee..a78480bb21 100644 --- a/cpp/include/raft/linalg/gemv.h +++ b/cpp/include/raft/linalg/gemv.h @@ -26,9 +26,19 @@ namespace raft { namespace linalg { template -void gemv(const raft::handle_t& handle, const math_t* a, int n_rows, int n_cols, - const math_t* x, int incx, math_t* y, int incy, bool trans_a, - math_t alpha, math_t beta, cudaStream_t stream) { +void gemv(const raft::handle_t& handle, + const math_t* a, + int n_rows, + int n_cols, + const math_t* x, + int incx, + math_t* y, + int incy, + bool trans_a, + math_t alpha, + math_t beta, + cudaStream_t stream) +{ cublasHandle_t cublas_h = handle.get_cublas_handle(); cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; @@ -40,33 +50,47 @@ void gemv(const raft::handle_t& handle, const math_t* a, int n_rows, int n_cols, // n - number of columns in input matrix // lda - purpose of it to have ability to operate on submatrices of matrix without copying. // If you're not think about it it's always should be equal to m - // lda has deal with memory layout, but has nothing with the requirement for cuBLAS perform transpose + // lda has deal with memory layout, but has nothing with the requirement for cuBLAS perform + // transpose // In Machine Learning: // m - nunmber of columns in design matrix(number of features) // n - number of rows in designed matrix (number of train examples) - int m = n_rows; - int n = n_cols; + int m = n_rows; + int n = n_cols; int lda = trans_a ? m : n; - CUBLAS_CHECK(cublasgemv(cublas_h, op_a, m, n, &alpha, a, lda, x, incx, &beta, - y, incy, stream)); + CUBLAS_CHECK(cublasgemv(cublas_h, op_a, m, n, &alpha, a, lda, x, incx, &beta, y, incy, stream)); } template -void gemv(const raft::handle_t& handle, const math_t* a, int n_rows_a, - int n_cols_a, const math_t* x, math_t* y, bool trans_a, math_t alpha, - math_t beta, cudaStream_t stream) { +void gemv(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* x, + math_t* y, + bool trans_a, + math_t alpha, + math_t beta, + cudaStream_t stream) +{ gemv(handle, a, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); } template -void gemv(const raft::handle_t& handle, const math_t* a, int n_rows_a, - int n_cols_a, const math_t* x, math_t* y, bool trans_a, - cudaStream_t stream) { +void gemv(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* x, + math_t* y, + bool trans_a, + cudaStream_t stream) +{ math_t alpha = math_t(1); - math_t beta = math_t(0); + math_t beta = math_t(0); gemv(handle, a, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); } diff --git a/cpp/include/raft/linalg/init.h b/cpp/include/raft/linalg/init.h index cb2e8ed1ab..2086172f5d 100644 --- a/cpp/include/raft/linalg/init.h +++ b/cpp/include/raft/linalg/init.h @@ -36,7 +36,8 @@ namespace { * \param [in] stream cuda stream */ template -void range(T *out, int start, int end, cudaStream_t stream) { +void range(T* out, int start, int end, cudaStream_t stream) +{ thrust::counting_iterator first(start); thrust::counting_iterator last = first + (end - start); thrust::device_ptr ptr(out); @@ -53,7 +54,8 @@ void range(T *out, int start, int end, cudaStream_t stream) { * \param [in] stream cuda stream */ template -void range(T *out, int n, cudaStream_t stream) { +void range(T* out, int n, cudaStream_t stream) +{ range(out, 0, n, stream); } } // unnamed namespace diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index b775a1f696..39089473e3 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -16,7 +16,7 @@ #pragma once -//for cmath: +// for cmath: #define _USE_MATH_DEFINES #include @@ -40,14 +40,14 @@ using namespace linalg; namespace spectral { // curandGeneratorNormalX -inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, - float *outputPtr, size_t n, - float mean, float stddev) { +inline curandStatus_t curandGenerateNormalX( + curandGenerator_t generator, float* outputPtr, size_t n, float mean, float stddev) +{ return curandGenerateNormal(generator, outputPtr, n, mean, stddev); } -inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, - double *outputPtr, size_t n, - double mean, double stddev) { +inline curandStatus_t curandGenerateNormalX( + curandGenerator_t generator, double* outputPtr, size_t n, double mean, double stddev) +{ return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev); } @@ -55,7 +55,7 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, // Helper functions // ========================================================= -/** +/** * @brief Perform Lanczos iteration * Lanczos iteration is performed on a shifted matrix A+shift*I. * @tparam index_type_t the type of data used for indexing. @@ -85,25 +85,30 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, * @return Zero if successful. Otherwise non-zero. */ template -int performLanczosIteration( - handle_t const &handle, sparse_matrix_t const *A, - index_type_t *iter, index_type_t maxIter, value_type_t shift, - value_type_t tol, bool reorthogonalize, value_type_t *__restrict__ alpha_host, - value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev) { +int performLanczosIteration(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t* iter, + index_type_t maxIter, + value_type_t shift, + value_type_t tol, + bool reorthogonalize, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev) +{ // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful variables - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; constexpr value_type_t negOne = -1; - constexpr value_type_t zero = 0; + constexpr value_type_t zero = 0; value_type_t alpha; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); RAFT_EXPECTS(A != nullptr, "Null matrix pointer."); @@ -117,29 +122,28 @@ int performLanczosIteration( // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev, + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, + lanczosVecs_dev, n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, stream)); + cudaMemcpyDeviceToDevice, + stream)); A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); // Orthogonalize Lanczos vector - CUBLAS_CHECK(cublasdot(cublas_h, n, lanczosVecs_dev, 1, - lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, - stream)); + CUBLAS_CHECK(cublasdot( + cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); alpha = -alpha_host[0]; - CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1, - lanczosVecs_dev + IDX(0, 1, n), 1, stream)); - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, - beta_host, stream)); + CUBLAS_CHECK(cublasaxpy( + cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); // Check if Lanczos has converged if (beta_host[0] <= tol) return 0; // Normalize Lanczos vector alpha = 1 / beta_host[0]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), - 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); } // ------------------------------------------------------- @@ -151,65 +155,121 @@ int performLanczosIteration( // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync( - lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n, - n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); - A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, - lanczosVecs_dev + IDX(0, *iter, n)); + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, + lanczosVecs_dev + (*iter - 1) * n, + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); // Full reorthogonalization // "Twice is enough" algorithm per Kahan and Parlett if (reorthogonalize) { - CUBLAS_CHECK(cublasgemv( - cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, - lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, - lanczosVecs_dev, n, work_dev, 1, &one, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); - - CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1), - sizeof(value_type_t), cudaMemcpyDeviceToHost, + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + + CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), + work_dev + (*iter - 1), + sizeof(value_type_t), + cudaMemcpyDeviceToHost, stream)); - CUBLAS_CHECK(cublasgemv( - cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, - lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, - lanczosVecs_dev, n, work_dev, 1, &one, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); } // Orthogonalization with 3-term recurrence relation else { - CUBLAS_CHECK(cublasdot(cublas_h, n, - lanczosVecs_dev + IDX(0, *iter - 1, n), 1, - lanczosVecs_dev + IDX(0, *iter, n), 1, - alpha_host + (*iter - 1), stream)); + CUBLAS_CHECK(cublasdot(cublas_h, + n, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + alpha_host + (*iter - 1), + stream)); auto alpha = -alpha_host[*iter - 1]; - CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, - lanczosVecs_dev + IDX(0, *iter - 1, n), 1, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); alpha = -beta_host[*iter - 2]; - CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, - lanczosVecs_dev + IDX(0, *iter - 2, n), 1, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 2, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); } // Compute residual - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, - beta_host + *iter - 1, stream)); + CUBLAS_CHECK(cublasnrm2( + cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream)); // Check if Lanczos has converged if (beta_host[*iter - 1] <= tol) break; // Normalize Lanczos vector alpha = 1 / beta_host[*iter - 1]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } CUDA_TRY(cudaStreamSynchronize(stream)); @@ -217,7 +277,7 @@ int performLanczosIteration( return 0; } -/** +/** * @brief Find Householder transform for 3-dimensional system * Given an input vector v=[x,y,z]', this function finds a * Householder transform P such that P*v is a multiple of @@ -235,8 +295,8 @@ int performLanczosIteration( * matrix. Matrix dimensions are 3 x 3. */ template -static void findHouseholder3(value_type_t *v, value_type_t *Pv, - value_type_t *P) { +static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P) +{ // Compute norm of vector *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); @@ -246,8 +306,7 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv, v[0] -= *Pv; // Normalize Householder vector - value_type_t normHouseholder = - std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + value_type_t normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); if (normHouseholder != 0) { v[0] /= normHouseholder; v[1] /= normHouseholder; @@ -261,11 +320,13 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv, // Construct Householder matrix index_type_t i, j; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) P[IDX(i, j, 3)] = -2 * v[i] * v[j]; - for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1; + for (i = 0; i < 3; ++i) + P[IDX(i, j, 3)] = -2 * v[i] * v[j]; + for (i = 0; i < 3; ++i) + P[IDX(i, i, 3)] += 1; } -/** +/** * @brief Apply 3-dimensional Householder transform to 4 x 4 matrix * The Householder transform is pre-applied to the top three rows * of the matrix and post-applied to the left three columns. The @@ -277,7 +338,8 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv, * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. */ template -static void applyHouseholder3(const value_type_t *v, value_type_t *A) { +static void applyHouseholder3(const value_type_t* v, value_type_t* A) +{ // Loop indices index_type_t i, j; // Dot product between Householder vector and matrix row/column @@ -286,19 +348,23 @@ static void applyHouseholder3(const value_type_t *v, value_type_t *A) { // Pre-apply Householder transform for (j = 0; j < 4; ++j) { vDotA = 0; - for (i = 0; i < 3; ++i) vDotA += v[i] * A[IDX(i, j, 4)]; - for (i = 0; i < 3; ++i) A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; + for (i = 0; i < 3; ++i) + vDotA += v[i] * A[IDX(i, j, 4)]; + for (i = 0; i < 3; ++i) + A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; } // Post-apply Householder transform for (i = 0; i < 4; ++i) { vDotA = 0; - for (j = 0; j < 3; ++j) vDotA += A[IDX(i, j, 4)] * v[j]; - for (j = 0; j < 3; ++j) A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; + for (j = 0; j < 3; ++j) + vDotA += A[IDX(i, j, 4)] * v[j]; + for (j = 0; j < 3; ++j) + A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; } } -/** +/** * @brief Perform one step of Francis QR algorithm * Equivalent to two steps of the classical QR algorithm on a * tridiagonal matrix. @@ -319,10 +385,14 @@ static void applyHouseholder3(const value_type_t *v, value_type_t *A) { * @return Zero if successful. Otherwise non-zero. */ template -static int francisQRIteration(index_type_t n, value_type_t shift1, - value_type_t shift2, value_type_t *alpha, - value_type_t *beta, value_type_t *V, - value_type_t *work) { +static int francisQRIteration(index_type_t n, + value_type_t shift1, + value_type_t shift2, + value_type_t* alpha, + value_type_t* beta, + value_type_t* V, + value_type_t* work) +{ // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- @@ -352,30 +422,30 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; householder[1] = beta[0] * (alpha[0] + alpha[1] + b); householder[2] = beta[0] * beta[1]; - findHouseholder3(householder, &temp, - householderMatrix); + findHouseholder3(householder, &temp, householderMatrix); // Apply initial Householder transform to create bulge memset(bulge, 0, 16 * sizeof(value_type_t)); - for (i = 0; i < 4; ++i) bulge[IDX(i, i, 4)] = alpha[i]; + for (i = 0; i < 4; ++i) + bulge[IDX(i, i, 4)] = alpha[i]; for (i = 0; i < 3; ++i) { bulge[IDX(i + 1, i, 4)] = beta[i]; bulge[IDX(i, i + 1, 4)] = beta[i]; } applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, - 3, 0, work, n); + Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n); memcpy(V, work, 3 * n * sizeof(value_type_t)); // Chase bulge to bottom-right of matrix with Householder transforms for (pos = 0; pos < n - 4; ++pos) { // Move to next position - alpha[pos] = bulge[IDX(0, 0, 4)]; + alpha[pos] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = bulge[IDX(3, 0, 4)]; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; bulge[IDX(3, 0, 4)] = 0; bulge[IDX(3, 1, 4)] = 0; bulge[IDX(3, 2, 4)] = beta[pos + 3]; @@ -385,22 +455,22 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, bulge[IDX(3, 3, 4)] = alpha[pos + 4]; // Apply Householder transform - findHouseholder3(householder, beta + pos, - householderMatrix); + findHouseholder3(householder, beta + pos, householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), - n, householderMatrix, 3, 0, work, n); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t)); } // Apply penultimate Householder transform // Values in the last row and column are zero - alpha[n - 4] = bulge[IDX(0, 0, 4)]; + alpha[n - 4] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = bulge[IDX(3, 0, 4)]; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; bulge[IDX(3, 0, 4)] = 0; bulge[IDX(3, 1, 4)] = 0; bulge[IDX(3, 2, 4)] = 0; @@ -408,37 +478,36 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, bulge[IDX(1, 3, 4)] = 0; bulge[IDX(2, 3, 4)] = 0; bulge[IDX(3, 3, 4)] = 0; - findHouseholder3(householder, beta + n - 4, - householderMatrix); + findHouseholder3(householder, beta + n - 4, householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, - householderMatrix, 3, 0, work, n); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t)); // Apply final Householder transform // Values in the last two rows and columns are zero - alpha[n - 3] = bulge[IDX(0, 0, 4)]; + alpha[n - 3] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = 0; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - findHouseholder3(householder, beta + n - 3, - householderMatrix); + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + findHouseholder3(householder, beta + n - 3, householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, - householderMatrix, 3, 0, work, n); + Lapack::gemm( + false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t)); // Bulge has been eliminated alpha[n - 2] = bulge[IDX(0, 0, 4)]; alpha[n - 1] = bulge[IDX(1, 1, 4)]; - beta[n - 2] = bulge[IDX(1, 0, 4)]; + beta[n - 2] = bulge[IDX(1, 0, 4)]; return 0; } -/** +/** * @brief Perform implicit restart of Lanczos algorithm * Shifts are Chebyshev nodes of unwanted region of matrix spectrum. * @tparam index_type_t the type of data used for indexing. @@ -474,23 +543,30 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, * @return error flag. */ template -static int lanczosRestart( - handle_t const &handle, index_type_t n, index_type_t iter, - index_type_t iter_new, value_type_t *shiftUpper, value_type_t *shiftLower, - value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ V_host, value_type_t *__restrict__ work_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev, bool smallest_eig) { +static int lanczosRestart(handle_t const& handle, + index_type_t n, + index_type_t iter, + index_type_t iter_new, + value_type_t* shiftUpper, + value_type_t* shiftLower, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ V_host, + value_type_t* __restrict__ work_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + bool smallest_eig) +{ // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful constants constexpr value_type_t zero = 0; - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Loop index index_type_t i; @@ -501,12 +577,12 @@ static int lanczosRestart( index_type_t restartSteps = iter - iter_new; // Ritz values from Lanczos method - value_type_t *ritzVals_host = work_host + 3 * iter; + value_type_t* ritzVals_host = work_host + 3 * iter; // Shifts for implicit restart - value_type_t *shifts_host; + value_type_t* shifts_host; // Orthonormal matrix for similarity transform - value_type_t *V_dev = work_dev + n * iter; + value_type_t* V_dev = work_dev + n * iter; // ------------------------------------------------------- // Implementation @@ -524,7 +600,8 @@ static int lanczosRestart( // Initialize similarity transform with identity matrix memset(V_host, 0, iter * iter * sizeof(value_type_t)); - for (i = 0; i < iter; ++i) V_host[IDX(i, i, iter)] = 1; + for (i = 0; i < iter; ++i) + V_host[IDX(i, i, iter)] = 1; // Determine interval to suppress eigenvalues if (smallest_eig) { @@ -548,49 +625,71 @@ static int lanczosRestart( // Calculate Chebyshev nodes as shifts shifts_host = ritzVals_host; for (i = 0; i < restartSteps; ++i) { - shifts_host[i] = - cos((i + 0.5) * static_cast(M_PI) / restartSteps); + shifts_host[i] = cos((i + 0.5) * static_cast(M_PI) / restartSteps); shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower)); shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower)); } // Apply Francis QR algorithm to implicitly restart Lanczos for (i = 0; i < restartSteps; i += 2) - if (francisQRIteration(iter, shifts_host[i], shifts_host[i + 1], alpha_host, - beta_host, V_host, work_host)) + if (francisQRIteration( + iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host)) WARNING("error in implicitly shifted QR algorithm"); // Obtain new residual - CUDA_TRY(cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); - - beta_host[iter - 1] = - beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; - CUBLAS_CHECK(cublasgemv( - cublas_h, CUBLAS_OP_N, n, iter, beta_host + iter_new - 1, lanczosVecs_dev, - n, V_dev + IDX(0, iter_new, iter), 1, beta_host + iter - 1, - lanczosVecs_dev + IDX(0, iter, n), 1, stream)); + CUDA_TRY(cudaMemcpyAsync( + V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); + + beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + iter, + beta_host + iter_new - 1, + lanczosVecs_dev, + n, + V_dev + IDX(0, iter_new, iter), + 1, + beta_host + iter - 1, + lanczosVecs_dev + IDX(0, iter, n), + 1, + stream)); // Obtain new Lanczos vectors - CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, iter_new, iter, - &one, lanczosVecs_dev, n, V_dev, iter, &zero, - work_dev, n, stream)); - - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev, + CUBLAS_CHECK(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + iter_new, + iter, + &one, + lanczosVecs_dev, + n, + V_dev, + iter, + &zero, + work_dev, + n, + stream)); + + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, + work_dev, n * iter_new * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, stream)); + cudaMemcpyDeviceToDevice, + stream)); // Normalize residual to obtain new Lanczos vector - CUDA_TRY(cudaMemcpyAsync( - lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n), - n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), + lanczosVecs_dev + IDX(0, iter, n), + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, - beta_host + iter_new - 1, stream)); + CUBLAS_CHECK(cublasnrm2( + cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream)); auto h_beta = 1 / beta_host[iter_new - 1]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, - lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); return 0; } @@ -601,7 +700,7 @@ static int lanczosRestart( // Eigensolver // ========================================================= -/** +/** * @brief Compute smallest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -651,19 +750,28 @@ static int lanczosRestart( * @return error flag. */ template -int computeSmallestEigenvectors( - handle_t const &handle, sparse_matrix_t const *A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t *effIter, - index_type_t *totalIter, value_type_t *shift, - value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { +int computeSmallestEigenvectors(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t* effIter, + index_type_t* totalIter, + value_type_t* shift, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed) +{ using namespace spectral; // Useful constants - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; constexpr value_type_t zero = 0; // Matrix dimension @@ -683,21 +791,20 @@ int computeSmallestEigenvectors( index_type_t i; // Host memory - value_type_t *Z_host; // Eigenvectors in Lanczos basis - value_type_t *work_host; // Workspace + value_type_t* Z_host; // Eigenvectors in Lanczos basis + value_type_t* work_host; // Workspace // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // ------------------------------------------------------- // Variable initialization @@ -710,12 +817,11 @@ int computeSmallestEigenvectors( std::vector Z_host_v(restartIter * restartIter); std::vector work_host_v(4 * restartIter); - Z_host = Z_host_v.data(); + Z_host = Z_host_v.data(); work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue to determine shift @@ -738,10 +844,18 @@ int computeSmallestEigenvectors( // Obtain tridiagonal matrix with Lanczos *effIter = 0; - *shift = 0; - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, 0.0, reorthogonalize, alpha_host, - beta_host, lanczosVecs_dev, work_dev); + *shift = 0; + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0.0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); // Determine largest eigenvalue @@ -756,9 +870,17 @@ int computeSmallestEigenvectors( // Obtain tridiagonal matrix with Lanczos *effIter = 0; - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, - beta_host, lanczosVecs_dev, work_dev); + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter; @@ -775,9 +897,19 @@ int computeSmallestEigenvectors( if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart( - handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, - beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, true); + status = lanczosRestart(handle, + n, + *effIter, + iter_new, + &shiftUpper, + &shiftLower, + alpha_host, + beta_host, + Z_host, + work_host, + lanczosVecs_dev, + work_dev, + true); if (status) WARNING("error in Lanczos implicit restart"); *effIter = iter_new; @@ -786,9 +918,17 @@ int computeSmallestEigenvectors( // Proceed with Lanczos method - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), - reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + tol * fabs(shiftLower), + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter - iter_new; } @@ -799,39 +939,59 @@ int computeSmallestEigenvectors( } // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, - (*effIter) * sizeof(value_type_t)); - memcpy(work_host + 3 * (*effIter), beta_host, - (*effIter - 1) * sizeof(value_type_t)); - Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), - work_host + 3 * (*effIter), Z_host, *effIter, + memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', + *effIter, + work_host + 2 * (*effIter), + work_host + 3 * (*effIter), + Z_host, + *effIter, work_host); // Obtain desired eigenvalues by applying shift - for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift; - for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0; + for (i = 0; i < *effIter; ++i) + work_host[i + 2 * (*effIter)] -= *shift; + for (i = *effIter; i < nEigVecs; ++i) + work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory - CUDA_TRY(cudaMemcpyAsync(eigVals_dev, work_host + 2 * (*effIter), + CUDA_TRY(cudaMemcpyAsync(eigVals_dev, + work_host + 2 * (*effIter), nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); + cudaMemcpyHostToDevice, + stream)); - CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host, + CUDA_TRY(cudaMemcpyAsync(work_dev, + Z_host, (*effIter) * nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); + cudaMemcpyHostToDevice, + stream)); CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis - CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, - *effIter, &one, lanczosVecs_dev, n, work_dev, - *effIter, &zero, eigVecs_dev, n, stream)); + CUBLAS_CHECK(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); // Clean up and exit curandDestroyGenerator(randGen); return 0; } -/** +/** * @brief Compute smallest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -869,20 +1029,25 @@ int computeSmallestEigenvectors( * @return error flag. */ template -int computeSmallestEigenvectors( - handle_t const &handle, sparse_matrix_t const &A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t &iter, - value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) { +int computeSmallestEigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 1234567) +{ using namespace spectral; // Matrix dimension index_type_t n = A.nrows_; // Check that parameters are valid - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); @@ -892,8 +1057,8 @@ int computeSmallestEigenvectors( std::vector alpha_host_v(restartIter); std::vector beta_host_v(restartIter); - value_type_t *alpha_host = alpha_host_v.data(); - value_type_t *beta_host = beta_host_v.data(); + value_type_t* alpha_host = alpha_host_v.data(); + value_type_t* beta_host = beta_host_v.data(); vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); vector_t work_dev(handle, (n + restartIter) * restartIter); @@ -901,10 +1066,23 @@ int computeSmallestEigenvectors( // Perform Lanczos method index_type_t effIter; value_type_t shift; - int status = computeSmallestEigenvectors( - handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, - &iter, &shift, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), - eigVals_dev, eigVecs_dev, seed); + int status = computeSmallestEigenvectors(handle, + &A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + &shift, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); // Clean up and return return status; @@ -914,7 +1092,7 @@ int computeSmallestEigenvectors( // Eigensolver // ========================================================= -/** +/** * @brief Compute largest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -959,19 +1137,27 @@ int computeSmallestEigenvectors( * @return error flag. */ template -int computeLargestEigenvectors( - handle_t const &handle, sparse_matrix_t const *A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t *effIter, - index_type_t *totalIter, value_type_t *__restrict__ alpha_host, - value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { +int computeLargestEigenvectors(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t* effIter, + index_type_t* totalIter, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed) +{ using namespace spectral; // Useful constants - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; constexpr value_type_t zero = 0; // Matrix dimension @@ -987,8 +1173,8 @@ int computeLargestEigenvectors( index_type_t i; // Host memory - value_type_t *Z_host; // Eigenvectors in Lanczos basis - value_type_t *work_host; // Workspace + value_type_t* Z_host; // Eigenvectors in Lanczos basis + value_type_t* work_host; // Workspace // ------------------------------------------------------- // Check that LAPACK is enabled @@ -998,15 +1184,14 @@ int computeLargestEigenvectors( // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // ------------------------------------------------------- // Variable initialization @@ -1019,12 +1204,11 @@ int computeLargestEigenvectors( std::vector Z_host_v(restartIter * restartIter); std::vector work_host_v(4 * restartIter); - Z_host = Z_host_v.data(); + Z_host = Z_host_v.data(); work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue @@ -1044,13 +1228,21 @@ int computeLargestEigenvectors( CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); // Obtain tridiagonal matrix with Lanczos - *effIter = 0; + *effIter = 0; value_type_t shift_val = 0.0; - value_type_t *shift = &shift_val; - - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, - beta_host, lanczosVecs_dev, work_dev); + value_type_t* shift = &shift_val; + + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter; @@ -1067,9 +1259,19 @@ int computeLargestEigenvectors( if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart( - handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, - beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, false); + status = lanczosRestart(handle, + n, + *effIter, + iter_new, + &shiftUpper, + &shiftLower, + alpha_host, + beta_host, + Z_host, + work_host, + lanczosVecs_dev, + work_dev, + false); if (status) WARNING("error in Lanczos implicit restart"); *effIter = iter_new; @@ -1078,9 +1280,17 @@ int computeLargestEigenvectors( // Proceed with Lanczos method - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), - reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + tol * fabs(shiftLower), + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter - iter_new; } @@ -1090,15 +1300,18 @@ int computeLargestEigenvectors( WARNING("implicitly restarted Lanczos failed to converge"); } for (int i = 0; i < restartIter; ++i) { - for (int j = 0; j < restartIter; ++j) Z_host[i * restartIter + j] = 0; + for (int j = 0; j < restartIter; ++j) + Z_host[i * restartIter + j] = 0; } // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, - (*effIter) * sizeof(value_type_t)); - memcpy(work_host + 3 * (*effIter), beta_host, - (*effIter - 1) * sizeof(value_type_t)); - Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), - work_host + 3 * (*effIter), Z_host, *effIter, + memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', + *effIter, + work_host + 2 * (*effIter), + work_host + 3 * (*effIter), + Z_host, + *effIter, work_host); // note: We need to pick the top nEigVecs eigenvalues @@ -1123,36 +1336,52 @@ int computeLargestEigenvectors( //} // Obtain desired eigenvalues by applying shift - for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift; + for (i = 0; i < *effIter; ++i) + work_host[i + 2 * (*effIter)] -= *shift; for (i = 0; i < top_eigenparis_idx_offset; ++i) work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory // skip smallest eigenvalue if needed - CUDA_TRY(cudaMemcpyAsync( - eigVals_dev, work_host + 2 * (*effIter) + top_eigenparis_idx_offset, - nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(eigVals_dev, + work_host + 2 * (*effIter) + top_eigenparis_idx_offset, + nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); // skip smallest eigenvector if needed CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host + (top_eigenparis_idx_offset * (*effIter)), (*effIter) * nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); + cudaMemcpyHostToDevice, + stream)); CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis - CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, - *effIter, &one, lanczosVecs_dev, n, work_dev, - *effIter, &zero, eigVecs_dev, n, stream)); + CUBLAS_CHECK(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); // Clean up and exit curandDestroyGenerator(randGen); return 0; } -/** +/** * @brief Compute largest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -1190,18 +1419,23 @@ int computeLargestEigenvectors( * @return error flag. */ template -int computeLargestEigenvectors( - handle_t const &handle, sparse_matrix_t const &A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t &iter, - value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 123456) { +int computeLargestEigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 123456) +{ // Matrix dimension index_type_t n = A.nrows_; // Check that parameters are valid - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); @@ -1211,18 +1445,30 @@ int computeLargestEigenvectors( std::vector alpha_host_v(restartIter); std::vector beta_host_v(restartIter); - value_type_t *alpha_host = alpha_host_v.data(); - value_type_t *beta_host = beta_host_v.data(); + value_type_t* alpha_host = alpha_host_v.data(); + value_type_t* beta_host = beta_host_v.data(); vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); vector_t work_dev(handle, (n + restartIter) * restartIter); // Perform Lanczos method index_type_t effIter; - int status = computeLargestEigenvectors( - handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, - &iter, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), - eigVals_dev, eigVecs_dev, seed); + int status = computeLargestEigenvectors(handle, + &A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); // Clean up and return return status; diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh index aff08da2d3..200818fdc3 100644 --- a/cpp/include/raft/linalg/map.cuh +++ b/cpp/include/raft/linalg/map.cuh @@ -24,21 +24,18 @@ namespace raft { namespace linalg { -template -__global__ void mapKernel(OutType *out, size_t len, MapOp map, const InType *in, - Args... args) { +template +__global__ void mapKernel(OutType* out, size_t len, MapOp map, const InType* in, Args... args) +{ auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); - if (idx < len) { - out[idx] = map(in[idx], args[idx]...); - } + if (idx < len) { out[idx] = map(in[idx], args[idx]...); } } -template -void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream, - const InType *in, Args... args) { +template +void mapImpl( + OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ const int nblks = raft::ceildiv(len, (size_t)TPB); mapKernel <<>>(out, len, map, in, args...); @@ -60,12 +57,14 @@ void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream, * @param args additional input arrays */ -template -void map(OutType *out, size_t len, MapOp map, cudaStream_t stream, - const InType *in, Args... args) { - mapImpl(out, len, map, stream, in, - args...); +void map(OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ + mapImpl(out, len, map, stream, in, args...); } } // namespace linalg diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/map_then_reduce.cuh index f2f198670a..78a7017c5c 100644 --- a/cpp/include/raft/linalg/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/map_then_reduce.cuh @@ -24,50 +24,66 @@ namespace raft { namespace linalg { -struct sum_tag {}; +struct sum_tag { +}; template -__device__ void reduce(OutType *out, const InType acc, sum_tag) { +__device__ void reduce(OutType* out, const InType acc, sum_tag) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; OutType tmp = BlockReduce(temp_storage).Sum(acc); - if (threadIdx.x == 0) { - raft::myAtomicAdd(out, tmp); - } + if (threadIdx.x == 0) { raft::myAtomicAdd(out, tmp); } } template -__device__ void reduce(OutType *out, const InType acc, ReduceLambda op) { +__device__ void reduce(OutType* out, const InType acc, ReduceLambda op) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; OutType tmp = BlockReduce(temp_storage).Reduce(acc, op); - if (threadIdx.x == 0) { - raft::myAtomicReduce(out, tmp, op); - } + if (threadIdx.x == 0) { raft::myAtomicReduce(out, tmp, op); } } -template -__global__ void mapThenReduceKernel(OutType *out, size_t len, OutType neutral, - MapOp map, ReduceLambda op, - const InType *in, Args... args) { +template +__global__ void mapThenReduceKernel(OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + const InType* in, + Args... args) +{ OutType acc = neutral; - auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); + auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); - if (idx < len) { - acc = map(in[idx], args[idx]...); - } + if (idx < len) { acc = map(in[idx], args[idx]...); } __syncthreads(); reduce(out, acc, op); } -template -void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map, - ReduceLambda op, cudaStream_t stream, const InType *in, - Args... args) { +template +void mapThenReduceImpl(OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + cudaStream_t stream, + const InType* in, + Args... args) +{ raft::update_device(out, &neutral, 1, stream); const int nblks = raft::ceildiv(len, (size_t)TPB); mapThenReduceKernel @@ -89,10 +105,14 @@ void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map, * @param args additional input arrays */ -template -void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream, - const InType *in, Args... args) { +void mapThenSumReduce( + OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ mapThenReduceImpl( out, len, (OutType)0, map, sum_tag(), stream, in, args...); } @@ -115,11 +135,21 @@ void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream, * @param args additional input arrays */ -template -void mapThenReduce(OutType *out, size_t len, OutType neutral, MapOp map, - ReduceLambda op, cudaStream_t stream, const InType *in, - Args... args) { +template +void mapThenReduce(OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + cudaStream_t stream, + const InType* in, + Args... args) +{ mapThenReduceImpl( out, len, neutral, map, op, stream, in, args...); } diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 902816418f..98b5eaa809 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -23,10 +23,15 @@ namespace raft { namespace linalg { template -__global__ void matrixVectorOpKernel(Type *out, const Type *matrix, - const Type *vector, IdxType D, IdxType N, - bool rowMajor, bool bcastAlongRows, - Lambda op) { +__global__ void matrixVectorOpKernel(Type* out, + const Type* matrix, + const Type* vector, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op) +{ typedef TxN_t VecType; IdxType len = N * D; IdxType idx = threadIdx.x; @@ -57,17 +62,21 @@ __global__ void matrixVectorOpKernel(Type *out, const Type *matrix, mat.store(out, idx); } -template -void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec, - IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op, cudaStream_t stream) { - IdxType len = N * D; - IdxType nblks = - raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB); +template +void matrixVectorOpImpl(Type* out, + const Type* matrix, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ + IdxType len = N * D; + IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB); matrixVectorOpKernel - <<>>(out, matrix, vec, D, N, rowMajor, - bcastAlongRows, op); + <<>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op); CUDA_CHECK(cudaPeekAtLastError()); } @@ -89,11 +98,18 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec, * @param stream cuda stream where to launch work */ template -void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, - IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op, - cudaStream_t stream) { +void matrixVectorOp(Type* out, + const Type* matrix, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ IdxType stride = rowMajor ? D : N; - size_t bytes = stride * sizeof(Type); + size_t bytes = stride * sizeof(Type); if (16 / sizeof(Type) && bytes % 16 == 0) { matrixVectorOpImpl( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); @@ -118,10 +134,16 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, ///@todo: come up with a cleaner interface to support these cases in future! template -__global__ void matrixVectorOpKernel(Type *out, const Type *matrix, - const Type *vector1, const Type *vector2, - IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op) { +__global__ void matrixVectorOpKernel(Type* out, + const Type* matrix, + const Type* vector1, + const Type* vector2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op) +{ typedef TxN_t VecType; IdxType len = N * D; IdxType idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio; @@ -154,15 +176,21 @@ __global__ void matrixVectorOpKernel(Type *out, const Type *matrix, mat.store(out, idx); } -template -void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1, - const Type *vec2, IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op, cudaStream_t stream) { +template +void matrixVectorOpImpl(Type* out, + const Type* matrix, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ IdxType nblks = raft::ceildiv(N * D, (IdxType)TPB); matrixVectorOpKernel - <<>>(out, matrix, vec1, vec2, D, N, rowMajor, - bcastAlongRows, op); + <<>>(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op); CUDA_CHECK(cudaPeekAtLastError()); } @@ -185,11 +213,19 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1, * @param stream cuda stream where to launch work */ template -void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1, - const Type *vec2, IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op, cudaStream_t stream) { +void matrixVectorOp(Type* out, + const Type* matrix, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ IdxType stride = rowMajor ? D : N; - size_t bytes = stride * sizeof(Type); + size_t bytes = stride * sizeof(Type); if (16 / sizeof(Type) && bytes % 16 == 0) { matrixVectorOpImpl( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh index 9d1538c172..a3fcc5bac6 100644 --- a/cpp/include/raft/linalg/mean_squared_error.cuh +++ b/cpp/include/raft/linalg/mean_squared_error.cuh @@ -24,7 +24,7 @@ namespace linalg { /** * @brief CUDA version mean squared error function mean((A-B)**2) * @tparam math_t data-type upon which the math operation will be performed - * @tparam TPB threads-per-block + * @tparam TPB threads-per-block * @param out the output mean squared error value (assumed to be a device pointer) * @param A input array (assumed to be a device pointer) * @param B input array (assumed to be a device pointer) @@ -33,14 +33,14 @@ namespace linalg { * @param stream cuda-stream where to launch this kernel */ template -void meanSquaredError(math_t* out, const math_t* A, const math_t* B, size_t len, - math_t weight, cudaStream_t stream) { +void meanSquaredError( + math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream) +{ auto sq_diff = [len, weight] __device__(const math_t a, const math_t b) { math_t diff = a - b; return diff * diff * weight / len; }; - mapThenSumReduce(out, len, sq_diff, stream, A, - B); + mapThenSumReduce(out, len, sq_diff, stream, A, B); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh index ce948c927d..53d57ecd00 100644 --- a/cpp/include/raft/linalg/multiply.cuh +++ b/cpp/include/raft/linalg/multiply.cuh @@ -33,11 +33,10 @@ namespace linalg { * @{ */ template -void multiplyScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, - cudaStream_t stream) { +void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +{ unaryOp( - out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, - stream); + out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh index 64930a7123..82558c8023 100644 --- a/cpp/include/raft/linalg/norm.cuh +++ b/cpp/include/raft/linalg/norm.cuh @@ -44,22 +44,46 @@ enum NormType { L1Norm = 0, L2Norm }; * @param stream cuda stream where to launch work * @param fin_op the final lambda op */ -template > -void rowNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type, - bool rowMajor, cudaStream_t stream, - Lambda fin_op = raft::Nop()) { +template > +void rowNorm(Type* dots, + const Type* data, + IdxType D, + IdxType N, + NormType type, + bool rowMajor, + cudaStream_t stream, + Lambda fin_op = raft::Nop()) +{ switch (type) { case L1Norm: - reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false, - raft::L1Op(), raft::Sum(), fin_op); + reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + true, + stream, + false, + raft::L1Op(), + raft::Sum(), + fin_op); break; case L2Norm: - reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false, - raft::L2Op(), raft::Sum(), fin_op); + reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + true, + stream, + false, + raft::L2Op(), + raft::Sum(), + fin_op); break; - default: - ASSERT(false, "Invalid norm type passed! [%d]", type); + default: ASSERT(false, "Invalid norm type passed! [%d]", type); }; } @@ -77,22 +101,46 @@ void rowNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type, * @param stream cuda stream where to launch work * @param fin_op the final lambda op */ -template > -void colNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type, - bool rowMajor, cudaStream_t stream, - Lambda fin_op = raft::Nop()) { +template > +void colNorm(Type* dots, + const Type* data, + IdxType D, + IdxType N, + NormType type, + bool rowMajor, + cudaStream_t stream, + Lambda fin_op = raft::Nop()) +{ switch (type) { case L1Norm: - reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false, - raft::L1Op(), raft::Sum(), fin_op); + reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + false, + stream, + false, + raft::L1Op(), + raft::Sum(), + fin_op); break; case L2Norm: - reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false, - raft::L2Op(), raft::Sum(), fin_op); + reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + false, + stream, + false, + raft::L2Op(), + raft::Sum(), + fin_op); break; - default: - ASSERT(false, "Invalid norm type passed! [%d]", type); + default: ASSERT(false, "Invalid norm type passed! [%d]", type); }; } diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh index cafa8d54f1..c2455ac3a8 100644 --- a/cpp/include/raft/linalg/qr.cuh +++ b/cpp/include/raft/linalg/qr.cuh @@ -40,15 +40,19 @@ namespace linalg { * @{ */ template -void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, - int n_rows, int n_cols, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); +void qrGetQ(const raft::handle_t& handle, + const math_t* M, + math_t* Q, + int n_rows, + int n_cols, + cudaStream_t stream) +{ + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int m = n_rows, n = n_cols; int k = min(m, n); - CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); raft::mr::device::buffer tau(allocator, stream, k); CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream)); @@ -58,19 +62,16 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork)); raft::mr::device::buffer workspace(allocator, stream, Lwork); - CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(), - workspace.data(), Lwork, devInfo.data(), - stream)); + CUSOLVER_CHECK(cusolverDngeqrf( + cusolverH, m, n, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 CUDA_CHECK(cudaDeviceSynchronize()); #endif - CUSOLVER_CHECK( - cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); + CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); workspace.resize(Lwork, stream); - CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, m, n, k, Q, m, tau.data(), - workspace.data(), Lwork, devInfo.data(), - stream)); + CUSOLVER_CHECK(cusolverDnorgqr( + cusolverH, m, n, k, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); } /** @@ -84,30 +85,41 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, * @param stream cuda stream */ template -void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, - int n_rows, int n_cols, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); +void qrGetQR(const raft::handle_t& handle, + math_t* M, + math_t* Q, + math_t* R, + int n_rows, + int n_cols, + cudaStream_t stream) +{ + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int m = n_rows, n = n_cols; raft::mr::device::buffer R_full(allocator, stream, m * n); raft::mr::device::buffer tau(allocator, stream, min(m, n)); - CUDA_CHECK( - cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); + CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); int R_full_nrows = m, R_full_ncols = n; - CUDA_CHECK(cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK( + cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); int Lwork; raft::mr::device::buffer devInfo(allocator, stream, 1); - CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows, - R_full_ncols, R_full.data(), - R_full_nrows, &Lwork)); + CUSOLVER_CHECK(cusolverDngeqrf_bufferSize( + cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, &Lwork)); raft::mr::device::buffer workspace(allocator, stream, Lwork); - CUSOLVER_CHECK(cusolverDngeqrf( - cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, - tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); + CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, + R_full_nrows, + R_full_ncols, + R_full.data(), + R_full_nrows, + tau.data(), + workspace.data(), + Lwork, + devInfo.data(), + stream)); // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 CUDA_CHECK(cudaDeviceSynchronize()); @@ -115,17 +127,24 @@ void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream); - CUDA_CHECK(cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK( + cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); int Q_nrows = m, Q_ncols = n; - CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, Q_nrows, Q_ncols, - min(Q_ncols, Q_nrows), Q, Q_nrows, - tau.data(), &Lwork)); + CUSOLVER_CHECK(cusolverDnorgqr_bufferSize( + cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), &Lwork)); workspace.resize(Lwork, stream); - CUSOLVER_CHECK(cusolverDnorgqr( - cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), - workspace.data(), Lwork, devInfo.data(), stream)); + CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, + Q_nrows, + Q_ncols, + min(Q_ncols, Q_nrows), + Q, + Q_nrows, + tau.data(), + workspace.data(), + Lwork, + devInfo.data(), + stream)); } /** @} */ diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh index d39577bbdd..693a797db9 100644 --- a/cpp/include/raft/linalg/reduce.cuh +++ b/cpp/include/raft/linalg/reduce.cuh @@ -52,28 +52,33 @@ namespace linalg { * @param reduce_op binary reduction operation * @param final_op elementwise operation to apply before storing results */ -template , +template , typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void reduce(OutType *dots, const InType *data, int D, int N, OutType init, - bool rowMajor, bool alongRows, cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::Nop(), + typename FinalLambda = raft::Nop> +void reduce(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + bool rowMajor, + bool alongRows, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) { + FinalLambda final_op = raft::Nop()) +{ if (rowMajor && alongRows) { - coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, - reduce_op, final_op); + coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if (rowMajor && !alongRows) { - stridedReduction(dots, data, D, N, init, stream, inplace, main_op, - reduce_op, final_op); + stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if (!rowMajor && alongRows) { - stridedReduction(dots, data, N, D, init, stream, inplace, main_op, - reduce_op, final_op); + stridedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); } else { - coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, - reduce_op, final_op); + coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); } } diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh index bba652e137..f931c976fd 100644 --- a/cpp/include/raft/linalg/strided_reduction.cuh +++ b/cpp/include/raft/linalg/strided_reduction.cuh @@ -28,14 +28,15 @@ namespace linalg { // of the matrix, i.e. reduce along columns for row major or reduce along rows // for column major layout template -__global__ void stridedSummationKernel(Type *dots, const Type *data, int D, - int N, Type init, MainLambda main_op) { +__global__ void stridedSummationKernel( + Type* dots, const Type* data, int D, int N, Type init, MainLambda main_op) +{ // Thread reduction Type thread_data = Type(init); - int colStart = blockIdx.x * blockDim.x + threadIdx.x; + int colStart = blockIdx.x * blockDim.x + threadIdx.x; if (colStart < D) { int rowStart = blockIdx.y * blockDim.y + threadIdx.y; - int stride = blockDim.y * gridDim.y; + int stride = blockDim.y * gridDim.y; for (int j = rowStart; j < N; j += stride) { int idx = colStart + j * D; thread_data += main_op(data[idx], j); @@ -44,8 +45,8 @@ __global__ void stridedSummationKernel(Type *dots, const Type *data, int D, // Block reduction extern __shared__ char tmp[]; // One element per thread in block - Type *temp = (Type *)tmp; // Cast to desired type - int myidx = threadIdx.x + blockDim.x * threadIdx.y; + Type* temp = (Type*)tmp; // Cast to desired type + int myidx = threadIdx.x + blockDim.x * threadIdx.y; temp[myidx] = thread_data; __syncthreads(); for (int j = blockDim.y / 2; j > 0; j /= 2) { @@ -54,24 +55,31 @@ __global__ void stridedSummationKernel(Type *dots, const Type *data, int D, } // Grid reduction - if ((colStart < D) && (threadIdx.y == 0)) - raft::myAtomicAdd(dots + colStart, temp[myidx]); + if ((colStart < D) && (threadIdx.y == 0)) raft::myAtomicAdd(dots + colStart, temp[myidx]); } // Kernel to perform reductions along the strided dimension // of the matrix, i.e. reduce along columns for row major or reduce along rows // for column major layout -template -__global__ void stridedReductionKernel(OutType *dots, const InType *data, int D, - int N, OutType init, MainLambda main_op, - ReduceLambda reduce_op) { +template +__global__ void stridedReductionKernel(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + MainLambda main_op, + ReduceLambda reduce_op) +{ // Thread reduction OutType thread_data = init; - IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x; + IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x; if (colStart < D) { IdxType rowStart = blockIdx.y * blockDim.y + threadIdx.y; - IdxType stride = blockDim.y * gridDim.y; + IdxType stride = blockDim.y * gridDim.y; for (IdxType j = rowStart; j < N; j += stride) { IdxType idx = colStart + j * D; thread_data = reduce_op(thread_data, main_op(data[idx], j)); @@ -79,14 +87,13 @@ __global__ void stridedReductionKernel(OutType *dots, const InType *data, int D, } // Block reduction - extern __shared__ char tmp[]; // One element per thread in block - auto *temp = (OutType *)tmp; // Cast to desired type + extern __shared__ char tmp[]; // One element per thread in block + auto* temp = (OutType*)tmp; // Cast to desired type IdxType myidx = threadIdx.x + ((IdxType)blockDim.x * (IdxType)threadIdx.y); - temp[myidx] = thread_data; + temp[myidx] = thread_data; __syncthreads(); for (int j = blockDim.y / 2; j > 0; j /= 2) { - if (threadIdx.y < j) - temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]); + if (threadIdx.y < j) temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]); __syncthreads(); } @@ -122,15 +129,23 @@ __global__ void stridedReductionKernel(OutType *dots, const InType *data, int D, * @param inplace reduction result added inplace or overwrites old values? * @param stream cuda stream where to launch work */ -template , +template , typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, - OutType init, cudaStream_t stream, bool inplace = false, - MainLambda main_op = raft::Nop(), + typename FinalLambda = raft::Nop> +void stridedReduction(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) { + FinalLambda final_op = raft::Nop()) +{ ///@todo: this extra should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) if (!inplace) @@ -140,7 +155,7 @@ void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, // Arbitrary numbers for now, probably need to tune const dim3 thrds(32, 16); IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y); - elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; + elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x), raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread)); const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y; @@ -153,8 +168,7 @@ void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, <<>>(dots, data, D, N, init, main_op); else stridedReductionKernel - <<>>(dots, data, D, N, init, main_op, - reduce_op); + <<>>(dots, data, D, N, init, main_op, reduce_op); ///@todo: this complication should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh index 882c105689..43060d0818 100644 --- a/cpp/include/raft/linalg/subtract.cuh +++ b/cpp/include/raft/linalg/subtract.cuh @@ -38,8 +38,8 @@ namespace linalg { * @param stream cuda stream where to launch work */ template -void subtractScalar(OutT *out, const InT *in, InT scalar, IdxType len, - cudaStream_t stream) { +void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) +{ auto op = [scalar] __device__(InT in) { return OutT(in - scalar); }; unaryOp(out, in, len, op, stream); } @@ -58,24 +58,25 @@ void subtractScalar(OutT *out, const InT *in, InT scalar, IdxType len, * @param stream cuda stream where to launch work */ template -void subtract(OutT *out, const InT *in1, const InT *in2, IdxType len, - cudaStream_t stream) { +void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) +{ auto op = [] __device__(InT a, InT b) { return OutT(a - b); }; binaryOp(out, in1, in2, len, op, stream); } template -__global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, - IdxType len) { - //TODO: kernel do not use shared memory in current implementation +__global__ void subtract_dev_scalar_kernel(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len) +{ + // TODO: kernel do not use shared memory in current implementation int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { - outDev[i] = inDev[i] - *singleScalarDev; - } + if (i < len) { outDev[i] = inDev[i] - *singleScalarDev; } } -/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i] +/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and + * write result to outDev[i] * @tparam math_t data-type upon which the math operation will be performed * @tparam IdxType Integer type used to for addressing * @param outDev the output buffer @@ -86,9 +87,12 @@ __global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev, * @remark block size has not been tuned */ template -void subtractDevScalar(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, IdxType len, - cudaStream_t stream) { +void subtractDevScalar(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len, + cudaStream_t stream) +{ // Just for the note - there is no way to express such operation with cuBLAS in effective way // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda const IdxType nblks = raft::ceildiv(len, (IdxType)TPB); diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh index 7357a68a4c..1cb8b7592f 100644 --- a/cpp/include/raft/linalg/svd.cuh +++ b/cpp/include/raft/linalg/svd.cuh @@ -50,14 +50,21 @@ namespace linalg { // TODO: couldn't template this function due to cusolverDnSgesvd and // cusolverSnSgesvd. Check if there is any other way. template -void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, - T *sing_vals, T *left_sing_vecs, T *right_sing_vecs, - bool trans_right, bool gen_left_vec, bool gen_right_vec, - cudaStream_t stream) { - std::shared_ptr allocator = - handle.get_device_allocator(); - cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - cublasHandle_t cublasH = handle.get_cublas_handle(); +void svdQR(const raft::handle_t& handle, + T* in, + int n_rows, + int n_cols, + T* sing_vals, + T* left_sing_vecs, + T* right_sing_vecs, + bool trans_right, + bool gen_left_vec, + bool gen_right_vec, + cudaStream_t stream) +{ + std::shared_ptr allocator = handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + cublasHandle_t cublasH = handle.get_cublas_handle(); #if CUDART_VERSION >= 10010 && CUDART_VERSION < 11000 // 46340: sqrt of max int value @@ -72,14 +79,13 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, const int n = n_cols; raft::mr::device::buffer devInfo(allocator, stream, 1); - T *d_rwork = nullptr; + T* d_rwork = nullptr; int lwork = 0; - CUSOLVER_CHECK( - cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); + CUSOLVER_CHECK(cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); raft::mr::device::buffer d_work(allocator, stream, lwork); - char jobu = 'S'; + char jobu = 'S'; char jobvt = 'A'; if (!gen_left_vec) { @@ -92,9 +98,23 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, strcpy(&jobvt, &new_vt); } - CUSOLVER_CHECK(cusolverDngesvd( - cusolverH, jobu, jobvt, m, n, in, m, sing_vals, left_sing_vecs, m, - right_sing_vecs, n, d_work.data(), lwork, d_rwork, devInfo.data(), stream)); + CUSOLVER_CHECK(cusolverDngesvd(cusolverH, + jobu, + jobvt, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + d_work.data(), + lwork, + d_rwork, + devInfo.data(), + stream)); // Transpose the right singular vector back if (trans_right) raft::linalg::transpose(right_sing_vecs, n_cols, stream); @@ -110,19 +130,37 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, } template -void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, - T *U, T *V, bool gen_left_vec, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); +void svdEig(const raft::handle_t& handle, + T* in, + int n_rows, + int n_cols, + T* S, + T* U, + T* V, + bool gen_left_vec, + cudaStream_t stream) +{ + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - cublasHandle_t cublasH = handle.get_cublas_handle(); + cublasHandle_t cublasH = handle.get_cublas_handle(); int len = n_cols * n_cols; raft::mr::device::buffer in_cross_mult(allocator, stream, len); T alpha = T(1); - T beta = T(0); - raft::linalg::gemm(handle, in, n_rows, n_cols, in, in_cross_mult.data(), - n_cols, n_cols, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, + T beta = T(0); + raft::linalg::gemm(handle, + in, + n_rows, + n_cols, + in, + in_cross_mult.data(), + n_cols, + n_cols, + CUBLAS_OP_T, + CUBLAS_OP_N, + alpha, + beta, stream); eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream); @@ -133,10 +171,20 @@ void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, raft::matrix::seqRoot(S, S, alpha, n_cols, stream, true); if (gen_left_vec) { - raft::linalg::gemm(handle, in, n_rows, n_cols, V, U, n_rows, n_cols, - CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); - raft::matrix::matrixVectorBinaryDivSkipZero(U, S, n_rows, n_cols, false, - true, stream); + raft::linalg::gemm(handle, + in, + n_rows, + n_cols, + V, + U, + n_rows, + n_cols, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); + raft::matrix::matrixVectorBinaryDivSkipZero(U, S, n_rows, n_cols, false, true, stream); } } @@ -158,11 +206,20 @@ void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, * @param stream cuda stream */ template -void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, - math_t *sing_vals, math_t *left_sing_vecs, - math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec, - math_t tol, int max_sweeps, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); +void svdJacobi(const raft::handle_t& handle, + math_t* in, + int n_rows, + int n_cols, + math_t* sing_vals, + math_t* left_sing_vecs, + math_t* right_sing_vecs, + bool gen_left_vec, + bool gen_right_vec, + math_t tol, + int max_sweeps, + cudaStream_t stream) +{ + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); gesvdjInfo_t gesvdj_params = NULL; @@ -177,18 +234,42 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, raft::mr::device::buffer devInfo(allocator, stream, 1); int lwork = 0; - int econ = 1; - - CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, - left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params)); + int econ = 1; + + CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + econ, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + &lwork, + gesvdj_params)); raft::mr::device::buffer d_work(allocator, stream, lwork); - CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, - left_sing_vecs, m, right_sing_vecs, n, d_work.data(), lwork, devInfo.data(), - gesvdj_params, stream)); + CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + econ, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + d_work.data(), + lwork, + devInfo.data(), + gesvdj_params, + stream)); CUSOLVER_CHECK(cusolverDnDestroyGesvdjInfo(gesvdj_params)); } @@ -207,18 +288,36 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, * @param stream cuda stream */ template -void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S, - math_t *V, math_t *out, int n_rows, int n_cols, int k, - cudaStream_t stream) { +void svdReconstruction(const raft::handle_t& handle, + math_t* U, + math_t* S, + math_t* V, + math_t* out, + int n_rows, + int n_cols, + int k, + cudaStream_t stream) +{ auto allocator = handle.get_device_allocator(); const math_t alpha = 1.0, beta = 0.0; raft::mr::device::buffer SVT(allocator, stream, k * n_cols); - raft::linalg::gemm(handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, - CUBLAS_OP_T, alpha, beta, stream); - raft::linalg::gemm(handle, U, n_rows, k, SVT.data(), out, n_rows, n_cols, - CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); + raft::linalg::gemm( + handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream); + raft::linalg::gemm(handle, + U, + n_rows, + k, + SVT.data(), + out, + n_rows, + n_cols, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); } /** @@ -236,10 +335,18 @@ void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S, * @param stream cuda stream */ template -bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, - math_t *S_vec, math_t *V, int n_rows, int n_cols, - int k, math_t tol, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); +bool evaluateSVDByL2Norm(const raft::handle_t& handle, + math_t* A_d, + math_t* U, + math_t* S_vec, + math_t* V, + int n_rows, + int n_cols, + int k, + math_t tol, + cudaStream_t stream) +{ + auto allocator = handle.get_device_allocator(); cublasHandle_t cublasH = handle.get_cublas_handle(); int m = n_rows, n = n_cols; @@ -263,16 +370,25 @@ bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, // calculate percent error const math_t alpha = 1.0, beta = -1.0; raft::mr::device::buffer A_minus_P(allocator, stream, m * n); - CUDA_CHECK( - cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); - - CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, - &alpha, A_d, m, &beta, P_d.data(), m, - A_minus_P.data(), m, stream)); - - math_t norm_A_minus_P = - raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream); - math_t percent_error = 100.0 * norm_A_minus_P / normA; + CUDA_CHECK(cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); + + CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, + CUBLAS_OP_N, + CUBLAS_OP_N, + m, + n, + &alpha, + A_d, + m, + &beta, + P_d.data(), + m, + A_minus_P.data(), + m, + stream)); + + math_t norm_A_minus_P = raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream); + math_t percent_error = 100.0 * norm_A_minus_P / normA; return (percent_error / 100.0 < tol); } diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/transpose.h index d90f6271fa..9b954c29c1 100644 --- a/cpp/include/raft/linalg/transpose.h +++ b/cpp/include/raft/linalg/transpose.h @@ -33,18 +33,34 @@ namespace linalg { * @param stream: cuda stream */ template -void transpose(const raft::handle_t &handle, math_t *in, math_t *out, - int n_rows, int n_cols, cudaStream_t stream) { +void transpose(const raft::handle_t& handle, + math_t* in, + math_t* out, + int n_rows, + int n_cols, + cudaStream_t stream) +{ cublasHandle_t cublas_h = handle.get_cublas_handle(); int out_n_rows = n_cols; int out_n_cols = n_rows; const math_t alpha = 1.0; - const math_t beta = 0.0; - CUBLAS_CHECK(raft::linalg::cublasgeam( - cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, out_n_rows, out_n_cols, &alpha, in, - n_rows, &beta, out, out_n_rows, out, out_n_rows, stream)); + const math_t beta = 0.0; + CUBLAS_CHECK(raft::linalg::cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_n_rows, + out_n_cols, + &alpha, + in, + n_rows, + &beta, + out, + out_n_rows, + out, + out_n_rows, + stream)); } /** @@ -54,24 +70,25 @@ void transpose(const raft::handle_t &handle, math_t *in, math_t *out, * @param stream: cuda stream */ template -void transpose(math_t *inout, int n, cudaStream_t stream) { - auto m = n; - auto size = n * n; - auto d_inout = inout; +void transpose(math_t* inout, int n, cudaStream_t stream) +{ + auto m = n; + auto size = n * n; + auto d_inout = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, - [=] __device__(int idx) { - int s_row = idx % m; - int s_col = idx / m; - int d_row = s_col; - int d_col = s_row; - if (s_row < s_col) { - auto temp = d_inout[d_col * m + d_row]; - d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row]; - d_inout[s_col * m + s_row] = temp; - } - }); + thrust::for_each( + thrust::cuda::par.on(stream), counting, counting + size, [=] __device__(int idx) { + int s_row = idx % m; + int s_col = idx / m; + int d_row = s_col; + int d_col = s_row; + if (s_row < s_col) { + auto temp = d_inout[d_col * m + d_row]; + d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row]; + d_inout[s_col * m + s_row] = temp; + } + }); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh index 46b4d296cb..198b9b2b10 100644 --- a/cpp/include/raft/linalg/unary_op.cuh +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -23,10 +23,9 @@ namespace raft { namespace linalg { -template -__global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len, - Lambda op) { +template +__global__ void unaryOpKernel(OutType* out, const InType* in, IdxType len, Lambda op) +{ typedef TxN_t InVecType; typedef TxN_t OutVecType; InVecType a; @@ -42,12 +41,10 @@ __global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len, b.store(out, idx); } -template -void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op, - cudaStream_t stream) { - const IdxType nblks = - raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); +template +void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); unaryOpKernel <<>>(out, in, len, op); CUDA_CHECK(cudaPeekAtLastError()); @@ -68,47 +65,38 @@ void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op, * @note Lambda must be a functor with the following signature: * `OutType func(const InType& val);` */ -template -void unaryOp(OutType *out, const InType *in, IdxType len, Lambda op, - cudaStream_t stream) { - if (len <= 0) return; //silently skip in case of 0 length input - constexpr auto maxSize = - sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); - size_t bytes = len * maxSize; - uint64_t inAddr = uint64_t(in); - uint64_t outAddr = uint64_t(out); - if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && - outAddr % 16 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && - outAddr % 8 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && - outAddr % 4 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && - outAddr % 2 == 0) { - unaryOpImpl( - out, in, len, op, stream); +template +void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +{ + if (len <= 0) return; // silently skip in case of 0 length input + constexpr auto maxSize = sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t inAddr = uint64_t(in); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && outAddr % 16 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && outAddr % 8 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && outAddr % 4 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) { + unaryOpImpl(out, in, len, op, stream); } else if (1 / maxSize) { - unaryOpImpl( - out, in, len, op, stream); + unaryOpImpl(out, in, len, op, stream); } else { - unaryOpImpl(out, in, len, op, - stream); + unaryOpImpl(out, in, len, op, stream); } } template -__global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) { +__global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op) +{ IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - if (idx < len) { - op(out + idx, idx); - } + if (idx < len) { op(out + idx, idx); } } /** @@ -128,14 +116,12 @@ __global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) { * where outLocationOffset will be out + idx. * @param[in] stream cuda stream where to launch work */ -template -void writeOnlyUnaryOp(OutType *out, IdxType len, Lambda op, - cudaStream_t stream) { +template +void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) +{ if (len <= 0) return; // silently skip in case of 0 length input auto nblks = raft::ceildiv(len, TPB); - writeOnlyUnaryOpKernel - <<>>(out, len, op); + writeOnlyUnaryOpKernel<<>>(out, len, op); CUDA_CHECK(cudaGetLastError()); } diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh index 0a72117140..579491b5cc 100644 --- a/cpp/include/raft/matrix/math.cuh +++ b/cpp/include/raft/matrix/math.cuh @@ -41,14 +41,18 @@ namespace matrix { * @param stream cuda stream */ template -void power(math_t *in, math_t *out, math_t scalar, int len, - cudaStream_t stream) { - auto d_src = in; +void power(math_t* in, math_t* out, math_t scalar, int len, cudaStream_t stream) +{ + auto d_src = in; auto d_dest = out; raft::linalg::binaryOp( - d_dest, d_src, d_src, len, - [=] __device__(math_t a, math_t b) { return scalar * a * b; }, stream); + d_dest, + d_src, + d_src, + len, + [=] __device__(math_t a, math_t b) { return scalar * a * b; }, + stream); } /** @@ -59,7 +63,8 @@ void power(math_t *in, math_t *out, math_t scalar, int len, * @param stream cuda stream */ template -void power(math_t *inout, math_t scalar, int len, cudaStream_t stream) { +void power(math_t* inout, math_t scalar, int len, cudaStream_t stream) +{ power(inout, inout, scalar, len, stream); } @@ -70,7 +75,8 @@ void power(math_t *inout, math_t scalar, int len, cudaStream_t stream) { * @param stream cuda stream */ template -void power(math_t *inout, int len, cudaStream_t stream) { +void power(math_t* inout, int len, cudaStream_t stream) +{ math_t scalar = 1.0; power(inout, scalar, len, stream); } @@ -84,7 +90,8 @@ void power(math_t *inout, int len, cudaStream_t stream) { * @{ */ template -void power(math_t *in, math_t *out, int len, cudaStream_t stream) { +void power(math_t* in, math_t* out, int len, cudaStream_t stream) +{ math_t scalar = 1.0; power(in, out, scalar, len, stream); } @@ -101,13 +108,20 @@ void power(math_t *in, math_t *out, int len, cudaStream_t stream) { * @param set_neg_zero whether to set negative numbers to zero */ template -void seqRoot(math_t *in, math_t *out, math_t scalar, IdxType len, - cudaStream_t stream, bool set_neg_zero = false) { - auto d_src = in; +void seqRoot(math_t* in, + math_t* out, + math_t scalar, + IdxType len, + cudaStream_t stream, + bool set_neg_zero = false) +{ + auto d_src = in; auto d_dest = out; raft::linalg::unaryOp( - d_dest, d_src, len, + d_dest, + d_src, + len, [=] __device__(math_t a) { if (set_neg_zero) { if (a < math_t(0)) { @@ -133,8 +147,9 @@ void seqRoot(math_t *in, math_t *out, math_t scalar, IdxType len, * @param set_neg_zero whether to set negative numbers to zero */ template -void seqRoot(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream, - bool set_neg_zero = false) { +void seqRoot( + math_t* inout, math_t scalar, IdxType len, cudaStream_t stream, bool set_neg_zero = false) +{ seqRoot(inout, inout, scalar, len, stream, set_neg_zero); } @@ -148,22 +163,27 @@ void seqRoot(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream, * @param stream cuda stream */ template -void seqRoot(math_t *in, math_t *out, IdxType len, cudaStream_t stream) { +void seqRoot(math_t* in, math_t* out, IdxType len, cudaStream_t stream) +{ math_t scalar = 1.0; seqRoot(in, out, scalar, len, stream); } template -void seqRoot(math_t *inout, IdxType len, cudaStream_t stream) { +void seqRoot(math_t* inout, IdxType len, cudaStream_t stream) +{ math_t scalar = 1.0; seqRoot(inout, inout, scalar, len, stream); } template -void setSmallValuesZero(math_t *out, const math_t *in, IdxType len, - cudaStream_t stream, math_t thres = 1e-15) { +void setSmallValuesZero( + math_t* out, const math_t* in, IdxType len, cudaStream_t stream, math_t thres = 1e-15) +{ raft::linalg::unaryOp( - out, in, len, + out, + in, + len, [=] __device__(math_t a) { if (a <= thres && -a <= thres) { return math_t(0); @@ -184,8 +204,8 @@ void setSmallValuesZero(math_t *out, const math_t *in, IdxType len, * @param thres: threshold */ template -void setSmallValuesZero(math_t *inout, IdxType len, cudaStream_t stream, - math_t thres = 1e-15) { +void setSmallValuesZero(math_t* inout, IdxType len, cudaStream_t stream, math_t thres = 1e-15) +{ setSmallValuesZero(inout, inout, len, stream, thres); } @@ -203,14 +223,21 @@ void setSmallValuesZero(math_t *inout, IdxType len, cudaStream_t stream, * @{ */ template -void reciprocal(math_t *in, math_t *out, math_t scalar, int len, - cudaStream_t stream, bool setzero = false, - math_t thres = 1e-15) { - auto d_src = in; +void reciprocal(math_t* in, + math_t* out, + math_t scalar, + int len, + cudaStream_t stream, + bool setzero = false, + math_t thres = 1e-15) +{ + auto d_src = in; auto d_dest = out; raft::linalg::unaryOp( - d_dest, d_src, len, + d_dest, + d_src, + len, [=] __device__(math_t a) { if (setzero) { if (abs(a) <= thres) { @@ -237,8 +264,13 @@ void reciprocal(math_t *in, math_t *out, math_t scalar, int len, * @param thres: Threshold to avoid dividing by zero (|value| < thres -> result = 0) */ template -void reciprocal(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream, - bool setzero = false, math_t thres = 1e-15) { +void reciprocal(math_t* inout, + math_t scalar, + IdxType len, + cudaStream_t stream, + bool setzero = false, + math_t thres = 1e-15) +{ reciprocal(inout, inout, scalar, len, stream, setzero, thres); } @@ -251,7 +283,8 @@ void reciprocal(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream, * @param stream cuda stream */ template -void reciprocal(math_t *inout, IdxType len, cudaStream_t stream) { +void reciprocal(math_t* inout, IdxType len, cudaStream_t stream) +{ math_t scalar = 1.0; reciprocal(inout, scalar, len, stream); } @@ -266,14 +299,15 @@ void reciprocal(math_t *inout, IdxType len, cudaStream_t stream) { * @param stream cuda stream */ template -void reciprocal(math_t *in, math_t *out, IdxType len, cudaStream_t stream) { +void reciprocal(math_t* in, math_t* out, IdxType len, cudaStream_t stream) +{ math_t scalar = 1.0; reciprocal(in, out, scalar, len, stream); } template -void setValue(math_t *out, const math_t *in, math_t scalar, int len, - cudaStream_t stream = 0) { +void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_t stream = 0) +{ raft::linalg::unaryOp( out, in, len, [scalar] __device__(math_t in) { return scalar; }, stream); } @@ -289,46 +323,44 @@ void setValue(math_t *out, const math_t *in, math_t scalar, int len, * @param stream cuda stream */ template -void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len, - cudaStream_t stream) { - auto d_src = src; +void ratio( + const raft::handle_t& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream) +{ + auto d_src = src; auto d_dest = dest; - std::shared_ptr allocator = - handle.get_device_allocator(); + std::shared_ptr allocator = handle.get_device_allocator(); raft::mr::device::buffer d_sum(allocator, stream, 1); - auto *d_sum_ptr = d_sum.data(); - auto no_op = [] __device__(math_t in) { return in; }; + auto* d_sum_ptr = d_sum.data(); + auto no_op = [] __device__(math_t in) { return in; }; raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src); raft::linalg::unaryOp( - d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); }, - stream); + d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); }, stream); } /** @} */ // Computes the argmax(d_in) column-wise in a DxN matrix template -__global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) { +__global__ void argmaxKernel(const T* d_in, int D, int N, T* argmax) +{ typedef cub::BlockReduce, TPB> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; // compute maxIndex=argMax index for column - using KVP = cub::KeyValuePair; + using KVP = cub::KeyValuePair; int rowStart = blockIdx.x * D; KVP thread_data(-1, -raft::myInf()); for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; + int idx = rowStart + i; thread_data = cub::ArgMax()(thread_data, KVP(i, d_in[idx])); } auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax()); - if (threadIdx.x == 0) { - argmax[blockIdx.x] = maxKV.key; - } + if (threadIdx.x == 0) { argmax[blockIdx.x] = maxKV.key; } } /** @@ -340,8 +372,8 @@ __global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) { * @param stream: cuda stream */ template -void argmax(const math_t *in, int n_rows, int n_cols, math_t *out, - cudaStream_t stream) { +void argmax(const math_t* in, int n_rows, int n_cols, math_t* out, cudaStream_t stream) +{ int D = n_rows; int N = n_cols; if (D <= 32) { @@ -360,30 +392,29 @@ void argmax(const math_t *in, int n_rows, int n_cols, math_t *out, // Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by // flipping the sign if the |max| value for each column is negative. template -__global__ void signFlipKernel(T *d_in, int D, int N) { +__global__ void signFlipKernel(T* d_in, int D, int N) +{ typedef cub::BlockReduce, TPB> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; // compute maxIndex=argMax (with abs()) index for column - using KVP = cub::KeyValuePair; + using KVP = cub::KeyValuePair; int rowStart = blockIdx.x * D; KVP thread_data(0, 0); for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; + int idx = rowStart + i; thread_data = cub::ArgMax()(thread_data, KVP(idx, abs(d_in[idx]))); } auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax()); // flip column sign if d_in[maxIndex] < 0 __shared__ bool need_sign_flip; - if (threadIdx.x == 0) { - need_sign_flip = d_in[maxKV.key] < T(0); - } + if (threadIdx.x == 0) { need_sign_flip = d_in[maxKV.key] < T(0); } __syncthreads(); if (need_sign_flip) { for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; + int idx = rowStart + i; d_in[idx] = -d_in[idx]; } } @@ -398,9 +429,10 @@ __global__ void signFlipKernel(T *d_in, int D, int N) { * @param stream cuda stream */ template -void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) { - int D = n_rows; - int N = n_cols; +void signFlip(math_t* inout, int n_rows, int n_cols, cudaStream_t stream) +{ + int D = n_rows; + int N = n_cols; auto data = inout; if (D <= 32) { signFlipKernel<<>>(data, D, N); @@ -415,20 +447,43 @@ void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) { } template -void matrixVectorBinaryMult(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void matrixVectorBinaryMult(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a * b; }, stream); + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a * b; }, + stream); } template -void matrixVectorBinaryMultSkipZero(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, - bool bcastAlongRows, cudaStream_t stream) { +void matrixVectorBinaryMultSkipZero(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, [] __device__(Type a, Type b) { if (b == Type(0)) return a; @@ -439,22 +494,45 @@ void matrixVectorBinaryMultSkipZero(Type *data, const Type *vec, IdxType n_row, } template -void matrixVectorBinaryDiv(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void matrixVectorBinaryDiv(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a / b; }, stream); + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a / b; }, + stream); } template -void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, - bool bcastAlongRows, cudaStream_t stream, - bool return_zero = false) { +void matrixVectorBinaryDivSkipZero(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream, + bool return_zero = false) +{ if (return_zero) { raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, [] __device__(Type a, Type b) { if (raft::myAbs(b) < Type(1e-10)) return Type(0); @@ -464,7 +542,13 @@ void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row, stream); } else { raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, [] __device__(Type a, Type b) { if (raft::myAbs(b) < Type(1e-10)) return a; @@ -476,21 +560,45 @@ void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row, } template -void matrixVectorBinaryAdd(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void matrixVectorBinaryAdd(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a + b; }, stream); + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a + b; }, + stream); } template -void matrixVectorBinarySub(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void matrixVectorBinarySub(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a - b; }, stream); + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a - b; }, + stream); } }; // end namespace matrix diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh index 5f5755e24e..71a2888545 100644 --- a/cpp/include/raft/matrix/matrix.cuh +++ b/cpp/include/raft/matrix/matrix.cuh @@ -49,29 +49,33 @@ using namespace std; * @param rowMajor whether the matrix has row major layout */ template -void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, - const idx_array_t *indices, idx_t n_rows_indices, - cudaStream_t stream, bool rowMajor = false) { +void copyRows(const m_t* in, + idx_t n_rows, + idx_t n_cols, + m_t* out, + const idx_array_t* indices, + idx_t n_rows_indices, + cudaStream_t stream, + bool rowMajor = false) +{ if (rowMajor) { const idx_t TPB = 256; - cache:: - get_vecs<<>>( - in, n_cols, indices, n_rows_indices, out); + cache::get_vecs<<>>( + in, n_cols, indices, n_rows_indices, out); CUDA_CHECK(cudaPeekAtLastError()); return; } - idx_t size = n_rows_indices * n_cols; + idx_t size = n_rows_indices * n_cols; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, - [=] __device__(idx_t idx) { - idx_t row = idx % n_rows_indices; - idx_t col = idx / n_rows_indices; + thrust::for_each( + thrust::cuda::par.on(stream), counting, counting + size, [=] __device__(idx_t idx) { + idx_t row = idx % n_rows_indices; + idx_t col = idx / n_rows_indices; - out[col * n_rows_indices + row] = - in[col * n_rows + indices[row]]; - }); + out[col * n_rows_indices + row] = in[col * n_rows + indices[row]]; + }); } /** @@ -83,8 +87,8 @@ void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, * @param stream: cuda stream */ template -void copy(const m_t *in, m_t *out, idx_t n_rows, idx_t n_cols, - cudaStream_t stream) { +void copy(const m_t* in, m_t* out, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ raft::copy_async(out, in, n_rows * n_cols, stream); } @@ -99,21 +103,22 @@ void copy(const m_t *in, m_t *out, idx_t n_rows, idx_t n_cols, * @param stream: cuda stream */ template -void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows, - idx_t out_n_cols, cudaStream_t stream) { - auto m = out_n_rows; - auto k = in_n_rows; - idx_t size = out_n_rows * out_n_cols; - auto d_q = in; +void truncZeroOrigin( + m_t* in, idx_t in_n_rows, m_t* out, idx_t out_n_rows, idx_t out_n_cols, cudaStream_t stream) +{ + auto m = out_n_rows; + auto k = in_n_rows; + idx_t size = out_n_rows * out_n_cols; + auto d_q = in; auto d_q_trunc = out; - auto counting = thrust::make_counting_iterator(0); + auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, - [=] __device__(idx_t idx) { - idx_t row = idx % m; - idx_t col = idx / m; - d_q_trunc[col * m + row] = d_q[col * k + row]; - }); + thrust::for_each( + thrust::cuda::par.on(stream), counting, counting + size, [=] __device__(idx_t idx) { + idx_t row = idx % m; + idx_t col = idx / m; + d_q_trunc[col * m + row] = d_q[col * k + row]; + }); } /** @@ -125,24 +130,25 @@ void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows, * @param stream: cuda stream */ template -void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - auto n = n_cols; - auto m = n_rows; - idx_t size = n_rows * n_cols; - auto d_q = inout; +void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ + auto n = n_cols; + auto m = n_rows; + idx_t size = n_rows * n_cols; + auto d_q = inout; auto d_q_reversed = inout; - auto counting = thrust::make_counting_iterator(0); + auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, - counting + (size / 2), [=] __device__(idx_t idx) { - idx_t dest_row = idx % m; - idx_t dest_col = idx / m; - idx_t src_row = dest_row; - idx_t src_col = (n - dest_col) - 1; - m_t temp = (m_t)d_q_reversed[idx]; - d_q_reversed[idx] = d_q[src_col * m + src_row]; - d_q[src_col * m + src_row] = temp; - }); + thrust::for_each( + thrust::cuda::par.on(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { + idx_t dest_row = idx % m; + idx_t dest_col = idx / m; + idx_t src_row = dest_row; + idx_t src_col = (n - dest_col) - 1; + m_t temp = (m_t)d_q_reversed[idx]; + d_q_reversed[idx] = d_q[src_col * m + src_row]; + d_q[src_col * m + src_row] = temp; + }); } /** @@ -154,25 +160,26 @@ void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { * @param stream: cuda stream */ template -void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - auto m = n_rows; - idx_t size = n_rows * n_cols; - auto d_q = inout; +void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ + auto m = n_rows; + idx_t size = n_rows * n_cols; + auto d_q = inout; auto d_q_reversed = inout; - auto counting = thrust::make_counting_iterator(0); + auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, - counting + (size / 2), [=] __device__(idx_t idx) { - idx_t dest_row = idx % m; - idx_t dest_col = idx / m; - idx_t src_row = (m - dest_row) - 1; - ; - idx_t src_col = dest_col; + thrust::for_each( + thrust::cuda::par.on(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { + idx_t dest_row = idx % m; + idx_t dest_col = idx / m; + idx_t src_row = (m - dest_row) - 1; + ; + idx_t src_col = dest_col; - m_t temp = (m_t)d_q_reversed[idx]; - d_q_reversed[idx] = d_q[src_col * m + src_row]; - d_q[src_col * m + src_row] = temp; - }); + m_t temp = (m_t)d_q_reversed[idx]; + d_q_reversed[idx] = d_q[src_col * m + src_row]; + d_q[src_col * m + src_row] = temp; + }); } /** @@ -184,16 +191,16 @@ void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { * @param v_separator: vertical separator character */ template -void print(const m_t *in, idx_t n_rows, idx_t n_cols, char h_separator = ' ', - char v_separator = '\n') { +void print( + const m_t* in, idx_t n_rows, idx_t n_cols, char h_separator = ' ', char v_separator = '\n') +{ std::vector h_matrix = std::vector(n_cols * n_rows); - CUDA_CHECK(cudaMemcpy(h_matrix.data(), in, n_cols * n_rows * sizeof(m_t), - cudaMemcpyDeviceToHost)); + CUDA_CHECK( + cudaMemcpy(h_matrix.data(), in, n_cols * n_rows * sizeof(m_t), cudaMemcpyDeviceToHost)); for (idx_t i = 0; i < n_rows; i++) { for (idx_t j = 0; j < n_cols; j++) { - printf("%1.4f%c", h_matrix[j * n_rows + i], - j < n_cols - 1 ? h_separator : v_separator); + printf("%1.4f%c", h_matrix[j * n_rows + i], j < n_cols - 1 ? h_separator : v_separator); } } } @@ -205,7 +212,8 @@ void print(const m_t *in, idx_t n_rows, idx_t n_cols, char h_separator = ' ', * @param n_cols: number of columns of input matrix */ template -void printHost(const m_t *in, idx_t n_rows, idx_t n_cols) { +void printHost(const m_t* in, idx_t n_rows, idx_t n_cols) +{ for (idx_t i = 0; i < n_rows; i++) { for (idx_t j = 0; j < n_cols; j++) { printf("%1.4f ", in[j * n_rows + i]); @@ -226,8 +234,9 @@ void printHost(const m_t *in, idx_t n_rows, idx_t n_cols) { * (1-based) */ template -__global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1, - idx_t y1, idx_t x2, idx_t y2) { +__global__ void slice( + m_t* src_d, idx_t m, idx_t n, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2) +{ idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t dm = x2 - x1, dn = y2 - y1; if (idx < dm * dn) { @@ -251,8 +260,16 @@ __global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1, * @param stream: cuda stream */ template -void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1, - idx_t y1, idx_t x2, idx_t y2, cudaStream_t stream) { +void sliceMatrix(m_t* in, + idx_t n_rows, + idx_t n_cols, + m_t* out, + idx_t x1, + idx_t y1, + idx_t x2, + idx_t y2, + cudaStream_t stream) +{ // Slicing dim3 block(64); dim3 grid(((x2 - x1) * (y2 - y1) + block.x - 1) / block.x); @@ -268,15 +285,13 @@ void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1, * @param k: min(n_rows, n_cols) */ template -__global__ void getUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, - idx_t n_cols, idx_t k) { +__global__ void getUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, idx_t k) +{ idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t m = n_rows, n = n_cols; if (idx < m * n) { idx_t i = idx % m, j = idx / m; - if (i < k && j < k && j >= i) { - dst[i + j * k] = src[idx]; - } + if (i < k && j < k && j >= i) { dst[i + j * k] = src[idx]; } } } @@ -289,8 +304,8 @@ __global__ void getUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, * @param stream: cuda stream */ template -void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols, - cudaStream_t stream) { +void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ idx_t m = n_rows, n = n_cols; idx_t k = min(m, n); dim3 block(64); @@ -307,13 +322,11 @@ void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols, * @param k: dimensionality */ template -__global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, idx_t m, - idx_t n, idx_t k) { +__global__ void copyVectorToMatrixDiagonal(m_t* vec, m_t* matrix, idx_t m, idx_t n, idx_t k) +{ idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < k) { - matrix[idx + idx * m] = vec[idx]; - } + if (idx < k) { matrix[idx + idx * m] = vec[idx]; } } /** @@ -325,13 +338,13 @@ __global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, idx_t m, * @param stream: cuda stream */ template -void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols, - cudaStream_t stream) { +void initializeDiagonalMatrix( + m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ idx_t k = min(n_rows, n_cols); dim3 block(64); dim3 grid((k + block.x - 1) / block.x); - copyVectorToMatrixDiagonal<<>>(vec, matrix, n_rows, - n_cols, k); + copyVectorToMatrixDiagonal<<>>(vec, matrix, n_rows, n_cols, k); } /** @@ -341,11 +354,10 @@ void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols, * @param len: size of one side of the matrix */ template -__global__ void matrixDiagonalInverse(m_t *in, idx_t len) { +__global__ void matrixDiagonalInverse(m_t* in, idx_t len) +{ idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < len) { - in[idx + idx * len] = 1.0 / in[idx + idx * len]; - } + if (idx < len) { in[idx + idx * len] = 1.0 / in[idx + idx * len]; } } /** @@ -355,7 +367,8 @@ __global__ void matrixDiagonalInverse(m_t *in, idx_t len) { * @param stream: cuda stream */ template -void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) { +void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream) +{ dim3 block(64); dim3 grid((len + block.x - 1) / block.x); matrixDiagonalInverse<<>>(in, len); @@ -369,12 +382,11 @@ void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) { * @param stream: cuda stream */ template -m_t getL2Norm(const raft::handle_t &handle, m_t *in, idx_t size, - cudaStream_t stream) { +m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream) +{ cublasHandle_t cublasH = handle.get_cublas_handle(); - m_t normval = 0; - CUBLAS_CHECK( - raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream)); + m_t normval = 0; + CUBLAS_CHECK(raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream)); return normval; } diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp index 29e0d7cfcd..18c8be5f45 100644 --- a/cpp/include/raft/mr/buffer_base.hpp +++ b/cpp/include/raft/mr/buffer_base.hpp @@ -35,11 +35,11 @@ namespace mr { template class buffer_base { public: - using size_type = std::size_t; - using value_type = T; - using iterator = value_type*; - using const_iterator = const value_type*; - using reference = T&; + using size_type = std::size_t; + using value_type = T; + using iterator = value_type*; + using const_iterator = const value_type*; + using reference = T&; using const_reference = const T&; buffer_base() = delete; @@ -55,16 +55,12 @@ class buffer_base { * @param[in] stream cuda stream where this allocation operations are async * @param[in] n size of the buffer (in number of elements) */ - buffer_base(std::shared_ptr allocator, cudaStream_t stream, - size_type n = 0) - : data_(nullptr), - size_(n), - capacity_(n), - stream_(stream), - allocator_(std::move(allocator)) { + buffer_base(std::shared_ptr allocator, cudaStream_t stream, size_type n = 0) + : data_(nullptr), size_(n), capacity_(n), stream_(stream), allocator_(std::move(allocator)) + { if (capacity_ > 0) { - data_ = static_cast( - allocator_->allocate(capacity_ * sizeof(value_type), stream_)); + data_ = + static_cast(allocator_->allocate(capacity_ * sizeof(value_type), stream_)); CUDA_CHECK(cudaStreamSynchronize(stream_)); } } @@ -98,23 +94,23 @@ class buffer_base { * @param[in] stream cuda stream where allocation operations are queued * @{ */ - void reserve(size_type new_capacity) { + void reserve(size_type new_capacity) + { if (new_capacity > capacity_) { - auto* new_data = static_cast( - allocator_->allocate(new_capacity * sizeof(value_type), stream_)); - if (size_ > 0) { - raft::copy(new_data, data_, size_, stream_); - } + auto* new_data = + static_cast(allocator_->allocate(new_capacity * sizeof(value_type), stream_)); + if (size_ > 0) { raft::copy(new_data, data_, size_, stream_); } // Only deallocate if we have allocated a pointer if (nullptr != data_) { allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); } - data_ = new_data; + data_ = new_data; capacity_ = new_capacity; } } - void reserve(size_type new_capacity, cudaStream_t stream) { + void reserve(size_type new_capacity, cudaStream_t stream) + { set_stream(stream); reserve(new_capacity); } @@ -127,12 +123,14 @@ class buffer_base { * @param[in] stream cuda stream where the work will be queued * @{ */ - void resize(const size_type new_size) { + void resize(const size_type new_size) + { reserve(new_size); size_ = new_size; } - void resize(const size_type new_size, cudaStream_t stream) { + void resize(const size_type new_size, cudaStream_t stream) + { set_stream(stream); resize(new_size); } @@ -146,16 +144,18 @@ class buffer_base { * @param[in] stream cuda stream where the work will be queued * @{ */ - void release() { + void release() + { if (nullptr != data_) { allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); } - data_ = nullptr; + data_ = nullptr; capacity_ = 0; - size_ = 0; + size_ = 0; } - void release(cudaStream_t stream) { + void release(cudaStream_t stream) + { set_stream(stream); release(); } @@ -195,7 +195,8 @@ class buffer_base { * @param[in] stream new cuda stream to be set. If it is the same as the * current one, then this method will be a no-op. */ - void set_stream(cudaStream_t stream) { + void set_stream(cudaStream_t stream) + { if (stream_ != stream) { cudaEvent_t event; CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); diff --git a/cpp/include/raft/mr/device/allocator.hpp b/cpp/include/raft/mr/device/allocator.hpp index 889e1640db..e930b617e0 100644 --- a/cpp/include/raft/mr/device/allocator.hpp +++ b/cpp/include/raft/mr/device/allocator.hpp @@ -32,17 +32,20 @@ namespace device { * further to the ones listed in `Allocator`: * - Allocations may be always on the device that was specified on construction. */ -class allocator : public base_allocator {}; +class allocator : public base_allocator { +}; /** Default device allocator based on the one provided by RMM */ class default_allocator : public allocator { public: - void* allocate(std::size_t n, cudaStream_t stream) override { + void* allocate(std::size_t n, cudaStream_t stream) override + { void* ptr = rmm::mr::get_current_device_resource()->allocate(n, stream); return ptr; } - void deallocate(void* p, std::size_t n, cudaStream_t stream) override { + void deallocate(void* p, std::size_t n, cudaStream_t stream) override + { rmm::mr::get_current_device_resource()->deallocate(p, n, stream); } }; // class default_allocator diff --git a/cpp/include/raft/mr/device/buffer.hpp b/cpp/include/raft/mr/device/buffer.hpp index 39b5674ce4..2b9d84368f 100644 --- a/cpp/include/raft/mr/device/buffer.hpp +++ b/cpp/include/raft/mr/device/buffer.hpp @@ -46,11 +46,11 @@ namespace device { template class buffer : public buffer_base { public: - using size_type = typename buffer_base::size_type; - using value_type = typename buffer_base::value_type; - using iterator = typename buffer_base::iterator; - using const_iterator = typename buffer_base::const_iterator; - using reference = typename buffer_base::reference; + using size_type = typename buffer_base::size_type; + using value_type = typename buffer_base::value_type; + using iterator = typename buffer_base::iterator; + using const_iterator = typename buffer_base::const_iterator; + using reference = typename buffer_base::reference; using const_reference = typename buffer_base::const_reference; buffer() = delete; @@ -60,7 +60,9 @@ class buffer : public buffer_base { buffer& operator=(const buffer& other) = delete; buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) - : buffer_base(alloc, stream, n) {} + : buffer_base(alloc, stream, n) + { + } }; // class buffer }; // namespace device diff --git a/cpp/include/raft/mr/host/allocator.hpp b/cpp/include/raft/mr/host/allocator.hpp index 8af266d4f0..62b6826211 100644 --- a/cpp/include/raft/mr/host/allocator.hpp +++ b/cpp/include/raft/mr/host/allocator.hpp @@ -34,20 +34,23 @@ namespace host { * further to the ones listed in `Allocator`: * - Allocations don't need to be zero copy accessible form a device. */ -class allocator : public base_allocator {}; +class allocator : public base_allocator { +}; /** Default cudaMallocHost/cudaFreeHost based host allocator */ class default_allocator : public allocator { public: - void* allocate(std::size_t n, cudaStream_t stream) override { + void* allocate(std::size_t n, cudaStream_t stream) override + { void* ptr = nullptr; CUDA_CHECK(cudaMallocHost(&ptr, n)); return ptr; } - void deallocate(void* p, std::size_t n, cudaStream_t stream) override { - //Must call _NO_THROW here since this is called frequently from object - //destructors which are "nothrow" by default + void deallocate(void* p, std::size_t n, cudaStream_t stream) override + { + // Must call _NO_THROW here since this is called frequently from object + // destructors which are "nothrow" by default CUDA_CHECK_NO_THROW(cudaFreeHost(p)); } }; // class default_allocator diff --git a/cpp/include/raft/mr/host/buffer.hpp b/cpp/include/raft/mr/host/buffer.hpp index 3c505bf2ed..52475ad6ec 100644 --- a/cpp/include/raft/mr/host/buffer.hpp +++ b/cpp/include/raft/mr/host/buffer.hpp @@ -48,11 +48,11 @@ namespace host { template class buffer : public buffer_base { public: - using size_type = typename buffer_base::size_type; - using value_type = typename buffer_base::value_type; - using iterator = typename buffer_base::iterator; - using const_iterator = typename buffer_base::const_iterator; - using reference = typename buffer_base::reference; + using size_type = typename buffer_base::size_type; + using value_type = typename buffer_base::value_type; + using iterator = typename buffer_base::iterator; + using const_iterator = typename buffer_base::const_iterator; + using reference = typename buffer_base::reference; using const_reference = typename buffer_base::const_reference; buffer() = delete; @@ -62,14 +62,15 @@ class buffer : public buffer_base { buffer& operator=(const buffer& other) = delete; buffer(std::shared_ptr alloc, const device::buffer& other) - : buffer_base(alloc, other.get_stream(), other.size()) { - if (other.size() > 0) { - raft::copy(data_, other.data(), other.size(), other.get_stream()); - } + : buffer_base(alloc, other.get_stream(), other.size()) + { + if (other.size() > 0) { raft::copy(data_, other.data(), other.size(), other.get_stream()); } } buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) - : buffer_base(alloc, stream, n) {} + : buffer_base(alloc, stream, n) + { + } reference operator[](size_type pos) { return data_[pos]; } diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh index 56710ea81f..5267770e8a 100644 --- a/cpp/include/raft/random/rng.cuh +++ b/cpp/include/raft/random/rng.cuh @@ -43,10 +43,9 @@ enum GeneratorType { GenKiss99 }; -template -__global__ void randKernel(uint64_t seed, uint64_t offset, OutType *ptr, - LenType len, Lambda randOp) { +template +__global__ void randKernel(uint64_t seed, uint64_t offset, OutType* ptr, LenType len, Lambda randOp) +{ LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x; detail::Generator gen(seed, (uint64_t)tid, offset); const LenType stride = gridDim.x * blockDim.x; @@ -58,10 +57,10 @@ __global__ void randKernel(uint64_t seed, uint64_t offset, OutType *ptr, } // used for Box-Muller type transformations -template -__global__ void rand2Kernel(uint64_t seed, uint64_t offset, OutType *ptr, - LenType len, Lambda2 rand2Op) { +template +__global__ void rand2Kernel( + uint64_t seed, uint64_t offset, OutType* ptr, LenType len, Lambda2 rand2Op) +{ LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x; detail::Generator gen(seed, (uint64_t)tid, offset); const LenType stride = gridDim.x * blockDim.x; @@ -77,8 +76,9 @@ __global__ void rand2Kernel(uint64_t seed, uint64_t offset, OutType *ptr, } template -__global__ void constFillKernel(Type *ptr, int len, Type val) { - unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; +__global__ void constFillKernel(Type* ptr, int len, Type val) +{ + unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; const unsigned stride = gridDim.x * blockDim.x; for (unsigned idx = tid; idx < len; idx += stride) { ptr[idx] = val; @@ -99,19 +99,20 @@ __global__ void constFillKernel(Type *ptr, int len, Type val) { * @{ */ template -DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1, - Type sigma2, Type mu2) { - constexpr Type twoPi = Type(2.0) * Type(3.141592654); +DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type sigma2, Type mu2) +{ + constexpr Type twoPi = Type(2.0) * Type(3.141592654); constexpr Type minus2 = -Type(2.0); - Type R = raft::mySqrt(minus2 * raft::myLog(val1)); - Type theta = twoPi * val2; + Type R = raft::mySqrt(minus2 * raft::myLog(val1)); + Type theta = twoPi * val2; Type s, c; raft::mySinCos(theta, s, c); val1 = R * c * sigma1 + mu1; val2 = R * s * sigma2 + mu2; } template -DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) { +DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1) +{ box_muller_transform(val1, val2, sigma1, mu1, sigma1, mu1); } /** @} */ @@ -131,7 +132,8 @@ class Rng { // simple heuristic to make sure all SMs will be occupied properly // and also not too many initialization calls will be made by each thread nBlocks(4 * getMultiProcessorCount()), - gen() { + gen() + { seed(_s); } @@ -142,7 +144,8 @@ class Rng { * function of timestamp. Another example is to use the c++11's * `std::random_device` for setting seed. */ - void seed(uint64_t _s) { + void seed(uint64_t _s) + { gen.seed(_s); offset = 0; } @@ -158,7 +161,8 @@ class Rng { * @param[out] b intercept parameter */ template - void affine_transform_params(IdxT n, IdxT &a, IdxT &b) { + void affine_transform_params(IdxT n, IdxT& a, IdxT& b) + { // always keep 'a' to be coprime to 'n' a = gen() % n; while (gcd(a, n) != 1) { @@ -181,27 +185,24 @@ class Rng { * @{ */ template - void uniform(Type *ptr, LenType len, Type start, Type end, - cudaStream_t stream) { + void uniform(Type* ptr, LenType len, Type start, Type end, cudaStream_t stream) + { static_assert(std::is_floating_point::value, "Type for 'uniform' can only be floating point!"); custom_distribution( - ptr, len, - [=] __device__(Type val, LenType idx) { - return (val * (end - start)) + start; - }, + ptr, + len, + [=] __device__(Type val, LenType idx) { return (val * (end - start)) + start; }, stream); } template - void uniformInt(IntType *ptr, LenType len, IntType start, IntType end, - cudaStream_t stream) { - static_assert(std::is_integral::value, - "Type for 'uniformInt' can only be integer!"); + void uniformInt(IntType* ptr, LenType len, IntType start, IntType end, cudaStream_t stream) + { + static_assert(std::is_integral::value, "Type for 'uniformInt' can only be integer!"); custom_distribution( - ptr, len, - [=] __device__(IntType val, LenType idx) { - return (val % (end - start)) + start; - }, + ptr, + len, + [=] __device__(IntType val, LenType idx) { return (val % (end - start)) + start; }, stream); } /** @} */ @@ -218,28 +219,37 @@ class Rng { * @{ */ template - void normal(Type *ptr, LenType len, Type mu, Type sigma, - cudaStream_t stream) { + void normal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream) + { static_assert(std::is_floating_point::value, "Type for 'normal' can only be floating point!"); rand2Impl( - offset, ptr, len, + offset, + ptr, + len, [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { box_muller_transform(val1, val2, sigma, mu); }, - NumThreads, nBlocks, type, stream); + NumThreads, + nBlocks, + type, + stream); } template - void normalInt(IntType *ptr, LenType len, IntType mu, IntType sigma, - cudaStream_t stream) { - static_assert(std::is_integral::value, - "Type for 'normalInt' can only be integer!"); + void normalInt(IntType* ptr, LenType len, IntType mu, IntType sigma, cudaStream_t stream) + { + static_assert(std::is_integral::value, "Type for 'normalInt' can only be integer!"); rand2Impl( - offset, ptr, len, - [=] __device__(double &val1, double &val2, LenType idx1, LenType idx2) { + offset, + ptr, + len, + [=] __device__(double& val1, double& val2, LenType idx1, LenType idx2) { box_muller_transform(val1, val2, sigma, mu); }, - NumThreads, nBlocks, type, stream); + NumThreads, + nBlocks, + type, + stream); } /** @} */ @@ -264,21 +274,32 @@ class Rng { * @param stream stream where to launch the kernel */ template - void normalTable(Type *ptr, LenType n_rows, LenType n_cols, const Type *mu, - const Type *sigma_vec, Type sigma, cudaStream_t stream) { + void normalTable(Type* ptr, + LenType n_rows, + LenType n_cols, + const Type* mu, + const Type* sigma_vec, + Type sigma, + cudaStream_t stream) + { rand2Impl( - offset, ptr, n_rows * n_cols, + offset, + ptr, + n_rows * n_cols, [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { // yikes! use fast-int-div - auto col1 = idx1 % n_cols; - auto col2 = idx2 % n_cols; + auto col1 = idx1 % n_cols; + auto col2 = idx2 % n_cols; auto mean1 = mu[col1]; auto mean2 = mu[col2]; - auto sig1 = sigma_vec == nullptr ? sigma : sigma_vec[col1]; - auto sig2 = sigma_vec == nullptr ? sigma : sigma_vec[col2]; + auto sig1 = sigma_vec == nullptr ? sigma : sigma_vec[col1]; + auto sig2 = sigma_vec == nullptr ? sigma : sigma_vec[col2]; box_muller_transform(val1, val2, sig1, mean1, sig2, mean2); }, - NumThreads, nBlocks, type, stream); + NumThreads, + nBlocks, + type, + stream); } /** @@ -291,7 +312,8 @@ class Rng { * @param stream stream where to launch the kernel */ template - void fill(Type *ptr, LenType len, Type val, cudaStream_t stream) { + void fill(Type* ptr, LenType len, Type val, cudaStream_t stream) + { constFillKernel<<>>(ptr, len, val); CUDA_CHECK(cudaPeekAtLastError()); } @@ -309,10 +331,10 @@ class Rng { * @param[in] stream stream where to launch the kernel */ template - void bernoulli(OutType *ptr, LenType len, Type prob, cudaStream_t stream) { + void bernoulli(OutType* ptr, LenType len, Type prob, cudaStream_t stream) + { custom_distribution( - ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; }, - stream); + ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; }, stream); } /** @@ -326,15 +348,14 @@ class Rng { * @param stream stream where to launch the kernel */ template - void scaled_bernoulli(Type *ptr, LenType len, Type prob, Type scale, - cudaStream_t stream) { + void scaled_bernoulli(Type* ptr, LenType len, Type prob, Type scale, cudaStream_t stream) + { static_assert(std::is_floating_point::value, "Type for 'scaled_bernoulli' can only be floating point!"); custom_distribution( - ptr, len, - [=] __device__(Type val, LenType idx) { - return val > prob ? -scale : scale; - }, + ptr, + len, + [=] __device__(Type val, LenType idx) { return val > prob ? -scale : scale; }, stream); } @@ -350,12 +371,12 @@ class Rng { * @note https://en.wikipedia.org/wiki/Gumbel_distribution */ template - void gumbel(Type *ptr, LenType len, Type mu, Type beta, cudaStream_t stream) { + void gumbel(Type* ptr, LenType len, Type mu, Type beta, cudaStream_t stream) + { custom_distribution( - ptr, len, - [=] __device__(Type val, LenType idx) { - return mu - beta * raft::myLog(-raft::myLog(val)); - }, + ptr, + len, + [=] __device__(Type val, LenType idx) { return mu - beta * raft::myLog(-raft::myLog(val)); }, stream); } @@ -370,16 +391,21 @@ class Rng { * @param stream stream where to launch the kernel */ template - void lognormal(Type *ptr, LenType len, Type mu, Type sigma, - cudaStream_t stream) { + void lognormal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream) + { rand2Impl( - offset, ptr, len, + offset, + ptr, + len, [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { box_muller_transform(val1, val2, sigma, mu); val1 = raft::myExp(val1); val2 = raft::myExp(val2); }, - NumThreads, nBlocks, type, stream); + NumThreads, + nBlocks, + type, + stream); } /** @@ -393,10 +419,11 @@ class Rng { * @param stream stream where to launch the kernel */ template - void logistic(Type *ptr, LenType len, Type mu, Type scale, - cudaStream_t stream) { + void logistic(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream) + { custom_distribution( - ptr, len, + ptr, + len, [=] __device__(Type val, LenType idx) { constexpr Type one = (Type)1.0; return mu - scale * raft::myLog(one / val - one); @@ -414,9 +441,11 @@ class Rng { * @param stream stream where to launch the kernel */ template - void exponential(Type *ptr, LenType len, Type lambda, cudaStream_t stream) { + void exponential(Type* ptr, LenType len, Type lambda, cudaStream_t stream) + { custom_distribution( - ptr, len, + ptr, + len, [=] __device__(Type val, LenType idx) { constexpr Type one = (Type)1.0; return -raft::myLog(one - val) / lambda; @@ -434,9 +463,11 @@ class Rng { * @param stream stream where to launch the kernel */ template - void rayleigh(Type *ptr, LenType len, Type sigma, cudaStream_t stream) { + void rayleigh(Type* ptr, LenType len, Type sigma, cudaStream_t stream) + { custom_distribution( - ptr, len, + ptr, + len, [=] __device__(Type val, LenType idx) { constexpr Type one = (Type)1.0; constexpr Type two = (Type)2.0; @@ -456,13 +487,14 @@ class Rng { * @param stream stream where to launch the kernel */ template - void laplace(Type *ptr, LenType len, Type mu, Type scale, - cudaStream_t stream) { + void laplace(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream) + { custom_distribution( - ptr, len, + ptr, + len, [=] __device__(Type val, LenType idx) { - constexpr Type one = (Type)1.0; - constexpr Type two = (Type)2.0; + constexpr Type one = (Type)1.0; + constexpr Type two = (Type)2.0; constexpr Type oneHalf = (Type)0.5; Type out; if (val <= oneHalf) { @@ -502,43 +534,44 @@ class Rng { * @param stream cuda stream */ template - void sampleWithoutReplacement(const raft::handle_t &handle, DataT *out, - IdxT *outIdx, const DataT *in, - const WeightsT *wts, IdxT sampledLen, IdxT len, - cudaStream_t stream) { - ASSERT(sampledLen <= len, - "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'."); - - std::shared_ptr allocator = - handle.get_device_allocator(); + void sampleWithoutReplacement(const raft::handle_t& handle, + DataT* out, + IdxT* outIdx, + const DataT* in, + const WeightsT* wts, + IdxT sampledLen, + IdxT len, + cudaStream_t stream) + { + ASSERT(sampledLen <= len, "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'."); + + std::shared_ptr allocator = handle.get_device_allocator(); raft::mr::device::buffer expWts(allocator, stream, len); raft::mr::device::buffer sortedWts(allocator, stream, len); raft::mr::device::buffer inIdx(allocator, stream, len); raft::mr::device::buffer outIdxBuff(allocator, stream, len); - auto *inIdxPtr = inIdx.data(); + auto* inIdxPtr = inIdx.data(); // generate modified weights custom_distribution( - expWts.data(), len, + expWts.data(), + len, [wts, inIdxPtr] __device__(WeightsT val, IdxT idx) { - inIdxPtr[idx] = idx; + inIdxPtr[idx] = idx; constexpr WeightsT one = (WeightsT)1.0; - auto exp = -raft::myLog(one - val); - if (wts != nullptr) { - return exp / wts[idx]; - } + auto exp = -raft::myLog(one - val); + if (wts != nullptr) { return exp / wts[idx]; } return exp; }, stream); ///@todo: use a more efficient partitioning scheme instead of full sort // sort the array and pick the top sampledLen items - IdxT *outIdxPtr = outIdxBuff.data(); + IdxT* outIdxPtr = outIdxBuff.data(); raft::mr::device::buffer workspace(allocator, stream); - sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, - (int)len, stream); + sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, (int)len, stream); if (outIdx != nullptr) { - CUDA_CHECK(cudaMemcpyAsync(outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync( + outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, cudaMemcpyDeviceToDevice, stream)); } scatter(out, in, outIdxPtr, sampledLen, stream); } @@ -558,17 +591,15 @@ class Rng { * @param[in] stream cuda stream * @{ */ - template - void custom_distribution(OutType *ptr, LenType len, Lambda randOp, - cudaStream_t stream) { + template + void custom_distribution(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream) + { randImpl( offset, ptr, len, randOp, NumThreads, nBlocks, type, stream); } - template - void custom_distribution2(OutType *ptr, LenType len, Lambda randOp, - cudaStream_t stream) { + template + void custom_distribution2(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream) + { rand2Impl( offset, ptr, len, randOp, NumThreads, nBlocks, type, stream); } @@ -591,12 +622,10 @@ class Rng { static const int NumThreads = 256; template - uint64_t _setupSeeds(uint64_t &seed, uint64_t &offset, LenType len, - int nThreads, int nBlocks) { + uint64_t _setupSeeds(uint64_t& seed, uint64_t& offset, LenType len, int nThreads, int nBlocks) + { LenType itemsPerThread = raft::ceildiv(len, LenType(nBlocks * nThreads)); - if (IsNormal && itemsPerThread % 2 == 1) { - ++itemsPerThread; - } + if (IsNormal && itemsPerThread % 2 == 1) { ++itemsPerThread; } // curand uses 2 32b uint's to generate one double uint64_t factor = sizeof(Type) / sizeof(float); if (factor == 0) ++factor; @@ -604,22 +633,26 @@ class Rng { // If not, then generate new seed and start from zero offset uint64_t newOffset = offset + LenType(itemsPerThread) * factor; if (newOffset < offset) { - offset = 0; - seed = gen(); + offset = 0; + seed = gen(); newOffset = itemsPerThread * factor; } return newOffset; } - template - void randImpl(uint64_t &offset, OutType *ptr, LenType len, Lambda randOp, - int nThreads, int nBlocks, GeneratorType type, - cudaStream_t stream) { + template + void randImpl(uint64_t& offset, + OutType* ptr, + LenType len, + Lambda randOp, + int nThreads, + int nBlocks, + GeneratorType type, + cudaStream_t stream) + { if (len <= 0) return; - uint64_t seed = gen(); - auto newOffset = _setupSeeds(seed, offset, len, - nThreads, nBlocks); + uint64_t seed = gen(); + auto newOffset = _setupSeeds(seed, offset, len, nThreads, nBlocks); switch (type) { case GenPhilox: randKernel @@ -633,26 +666,28 @@ class Rng { randKernel <<>>(seed, offset, ptr, len, randOp); break; - default: - ASSERT(false, "randImpl: Incorrect generator type! %d", type); + default: ASSERT(false, "randImpl: Incorrect generator type! %d", type); }; CUDA_CHECK(cudaGetLastError()); offset = newOffset; } - template - void rand2Impl(uint64_t &offset, OutType *ptr, LenType len, Lambda2 rand2Op, - int nThreads, int nBlocks, GeneratorType type, - cudaStream_t stream) { + template + void rand2Impl(uint64_t& offset, + OutType* ptr, + LenType len, + Lambda2 rand2Op, + int nThreads, + int nBlocks, + GeneratorType type, + cudaStream_t stream) + { if (len <= 0) return; - auto seed = gen(); - auto newOffset = _setupSeeds(seed, offset, len, - nThreads, nBlocks); + auto seed = gen(); + auto newOffset = _setupSeeds(seed, offset, len, nThreads, nBlocks); switch (type) { case GenPhilox: - rand2Kernel + rand2Kernel <<>>(seed, offset, ptr, len, rand2Op); break; case GenTaps: @@ -660,12 +695,10 @@ class Rng { <<>>(seed, offset, ptr, len, rand2Op); break; case GenKiss99: - rand2Kernel + rand2Kernel <<>>(seed, offset, ptr, len, rand2Op); break; - default: - ASSERT(false, "rand2Impl: Incorrect generator type! %d", type); + default: ASSERT(false, "rand2Impl: Incorrect generator type! %d", type); }; CUDA_CHECK(cudaGetLastError()); offset = newOffset; diff --git a/cpp/include/raft/random/rng_impl.cuh b/cpp/include/raft/random/rng_impl.cuh index d44c6f018b..485f4ddd68 100644 --- a/cpp/include/raft/random/rng_impl.cuh +++ b/cpp/include/raft/random/rng_impl.cuh @@ -33,7 +33,8 @@ struct PhiloxGenerator { * @param subsequence as found in curand docs * @param offset as found in curand docs */ - DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { + DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) + { curand_init(seed, subsequence, offset, &state); } @@ -44,18 +45,21 @@ struct PhiloxGenerator { DI void next(float& ret) { ret = curand_uniform(&(this->state)); } DI void next(double& ret) { ret = curand_uniform_double(&(this->state)); } DI void next(uint32_t& ret) { ret = curand(&(this->state)); } - DI void next(uint64_t& ret) { + DI void next(uint64_t& ret) + { uint32_t a, b; next(a); next(b); ret = (uint64_t)a | ((uint64_t)b << 32); } - DI void next(int32_t& ret) { + DI void next(int32_t& ret) + { uint32_t val; next(val); ret = int32_t(val & 0x7fffffff); } - DI void next(int64_t& ret) { + DI void next(int64_t& ret) + { uint64_t val; next(val); ret = int64_t(val & 0x7fffffffffffffff); @@ -76,8 +80,9 @@ struct TapsGenerator { * @param subsequence unused * @param offset unused */ - DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { - uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x; + DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) + { + uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x; uint64_t stride = blockDim.x * gridDim.x; delta += ((blockIdx.y * blockDim.y) + threadIdx.y) * stride; stride *= blockDim.y * gridDim.y; @@ -90,31 +95,36 @@ struct TapsGenerator { * @{ */ template - DI void next(Type& ret) { + DI void next(Type& ret) + { constexpr double ULL_LARGE = 1.8446744073709551614e19; uint64_t val; next(val); ret = static_cast(val); ret /= static_cast(ULL_LARGE); } - DI void next(uint64_t& ret) { + DI void next(uint64_t& ret) + { constexpr uint64_t TAPS = 0x8000100040002000ULL; - constexpr int ROUNDS = 128; + constexpr int ROUNDS = 128; for (int i = 0; i < ROUNDS; i++) state = (state >> 1) ^ (-(state & 1ULL) & TAPS); ret = state; } - DI void next(uint32_t& ret) { + DI void next(uint32_t& ret) + { uint64_t val; next(val); ret = (uint32_t)val; } - DI void next(int32_t& ret) { + DI void next(int32_t& ret) + { uint32_t val; next(val); ret = int32_t(val & 0x7fffffff); } - DI void next(int64_t& ret) { + DI void next(int64_t& ret) + { uint64_t val; next(val); ret = int64_t(val & 0x7fffffffffffffff); @@ -135,46 +145,49 @@ struct Kiss99Generator { * @param subsequence unused * @param offset unused */ - DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { - initKiss99(seed); - } + DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { initKiss99(seed); } /** * @defgroup NextRand Generate the next random number * @{ */ template - DI void next(Type& ret) { + DI void next(Type& ret) + { constexpr double U_LARGE = 4.294967295e9; uint32_t val; next(val); ret = static_cast(val); ret /= static_cast(U_LARGE); } - DI void next(uint32_t& ret) { + DI void next(uint32_t& ret) + { uint32_t MWC; - z = 36969 * (z & 65535) + (z >> 16); - w = 18000 * (w & 65535) + (w >> 16); + z = 36969 * (z & 65535) + (z >> 16); + w = 18000 * (w & 65535) + (w >> 16); MWC = ((z << 16) + w); jsr ^= (jsr << 17); jsr ^= (jsr >> 13); jsr ^= (jsr << 5); jcong = 69069 * jcong + 1234567; - MWC = ((MWC ^ jcong) + jsr); - ret = MWC; + MWC = ((MWC ^ jcong) + jsr); + ret = MWC; } - DI void next(uint64_t& ret) { + DI void next(uint64_t& ret) + { uint32_t a, b; next(a); next(b); ret = (uint64_t)a | ((uint64_t)b << 32); } - DI void next(int32_t& ret) { + DI void next(int32_t& ret) + { uint32_t val; next(val); ret = int32_t(val & 0x7fffffff); } - DI void next(int64_t& ret) { + DI void next(int64_t& ret) + { uint64_t val; next(val); ret = int64_t(val & 0x7fffffffffffffff); @@ -193,7 +206,8 @@ struct Kiss99Generator { // This function multiplies 128-bit hash by 128-bit FNV prime and returns lower // 128 bits. It uses 32-bit wide multiply only. - DI void mulByFnv1a128Prime(uint32_t* h) { + DI void mulByFnv1a128Prime(uint32_t* h) + { typedef union { uint32_t u32[2]; uint64_t u64[1]; @@ -217,12 +231,12 @@ struct Kiss99Generator { // h_n[2] = HI(h[1]*p[0]) + LO(h[2]*p[0]) + LO(h[0]*p[2]); // h_n[3] = HI(h[2]*p[0]) + HI(h[0]*p[2]) + LO(h[3]*p[0]) + LO(h[1]*p[2]); uint32_t carry = 0; - h[0] = h0p0.u32[0]; + h[0] = h0p0.u32[0]; - h[1] = h0p0.u32[1] + h1p0.u32[0]; + h[1] = h0p0.u32[1] + h1p0.u32[0]; carry = h[1] < h0p0.u32[1] ? 1 : 0; - h[2] = h1p0.u32[1] + carry; + h[2] = h1p0.u32[1] + carry; carry = h[2] < h1p0.u32[1] ? 1 : 0; h[2] += h2p0.u32[0]; carry = h[2] < h2p0.u32[0] ? carry + 1 : carry; @@ -233,7 +247,8 @@ struct Kiss99Generator { return; } - DI void fnv1a128(uint32_t* hash, uint32_t txt) { + DI void fnv1a128(uint32_t* hash, uint32_t txt) + { hash[0] ^= (txt >> 0) & 0xFF; mulByFnv1a128Prime(hash); hash[0] ^= (txt >> 8) & 0xFF; @@ -244,7 +259,8 @@ struct Kiss99Generator { mulByFnv1a128Prime(hash); } - DI void initKiss99(uint64_t seed) { + DI void initKiss99(uint64_t seed) + { // Initialize hash to 128-bit FNV1a basis uint32_t hash[4] = {1653982605UL, 1656234357UL, 129696066UL, 1818371886UL}; @@ -259,9 +275,9 @@ struct Kiss99Generator { fnv1a128(hash, uint32_t(seed >> 32)); // Initialize KISS99 state with hash - z = hash[0]; - w = hash[1]; - jsr = hash[2]; + z = hash[0]; + w = hash[1]; + jsr = hash[2]; jcong = hash[3]; } }; @@ -273,10 +289,13 @@ struct Kiss99Generator { template struct Generator { DI Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) - : gen(seed, subsequence, offset) {} + : gen(seed, subsequence, offset) + { + } template - DI void next(Type& ret) { + DI void next(Type& ret) + { gen.next(ret); } diff --git a/cpp/include/raft/sparse/convert/coo.cuh b/cpp/include/raft/sparse/convert/coo.cuh index e367550060..5d38bdf4a8 100644 --- a/cpp/include/raft/sparse/convert/coo.cuh +++ b/cpp/include/raft/sparse/convert/coo.cuh @@ -37,14 +37,18 @@ namespace sparse { namespace convert { template -__global__ void csr_to_coo_kernel(const value_idx *row_ind, value_idx m, - value_idx *coo_rows, value_idx nnz) { +__global__ void csr_to_coo_kernel(const value_idx* row_ind, + value_idx m, + value_idx* coo_rows, + value_idx nnz) +{ // row-based matrix 1 thread per row value_idx row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < m) { value_idx start_idx = row_ind[row]; - value_idx stop_idx = get_stop_idx(row, m, nnz, row_ind); - for (value_idx i = start_idx; i < stop_idx; i++) coo_rows[i] = row; + value_idx stop_idx = get_stop_idx(row, m, nnz, row_ind); + for (value_idx i = start_idx; i < stop_idx; i++) + coo_rows[i] = row; } } @@ -57,14 +61,14 @@ __global__ void csr_to_coo_kernel(const value_idx *row_ind, value_idx m, * @param stream: cuda stream to use */ template -void csr_to_coo(const value_idx *row_ind, value_idx m, value_idx *coo_rows, - value_idx nnz, cudaStream_t stream) { +void csr_to_coo( + const value_idx* row_ind, value_idx m, value_idx* coo_rows, value_idx nnz, cudaStream_t stream) +{ // @TODO: Use cusparse for this. dim3 grid(raft::ceildiv(m, (value_idx)TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_to_coo_kernel - <<>>(row_ind, m, coo_rows, nnz); + csr_to_coo_kernel<<>>(row_ind, m, coo_rows, nnz); CUDA_CHECK(cudaGetLastError()); } diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh index a034bdbda8..2191f5edd1 100644 --- a/cpp/include/raft/sparse/convert/csr.cuh +++ b/cpp/include/raft/sparse/convert/csr.cuh @@ -44,29 +44,33 @@ namespace sparse { namespace convert { template -void coo_to_csr(const raft::handle_t &handle, const int *srcRows, - const int *srcCols, const value_t *srcVals, int nnz, int m, - int *dst_offsets, int *dstCols, value_t *dstVals) { - auto stream = handle.get_stream(); +void coo_to_csr(const raft::handle_t& handle, + const int* srcRows, + const int* srcCols, + const value_t* srcVals, + int nnz, + int m, + int* dst_offsets, + int* dstCols, + value_t* dstVals) +{ + auto stream = handle.get_stream(); auto cusparseHandle = handle.get_cusparse_handle(); - auto d_alloc = handle.get_device_allocator(); + auto d_alloc = handle.get_device_allocator(); raft::mr::device::buffer dstRows(d_alloc, stream, nnz); - CUDA_CHECK(cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, - cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK(cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK( + cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK( + cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); auto buffSize = raft::sparse::cusparsecoosort_bufferSizeExt( cusparseHandle, m, m, nnz, srcRows, srcCols, stream); raft::mr::device::buffer pBuffer(d_alloc, stream, buffSize); raft::mr::device::buffer P(d_alloc, stream, nnz); - CUSPARSE_CHECK( - cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data())); - raft::sparse::cusparsecoosortByRow(cusparseHandle, m, m, nnz, dstRows.data(), - dstCols, P.data(), pBuffer.data(), stream); - raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(), - stream); - raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m, - dst_offsets, stream); + CUSPARSE_CHECK(cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data())); + raft::sparse::cusparsecoosortByRow( + cusparseHandle, m, m, nnz, dstRows.data(), dstCols, P.data(), pBuffer.data(), stream); + raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(), stream); + raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m, dst_offsets, stream); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -85,14 +89,20 @@ void coo_to_csr(const raft::handle_t &handle, const int *srcRows, * @param stream cuda stream to use * @param fused_op: the fused operation */ -template void> -void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, - Index_ batchSize, const bool *adj, - Index_ *row_ind_ptr, cudaStream_t stream, - Lambda fused_op) { +template void> +void csr_adj_graph_batched(const Index_* row_ind, + Index_ total_rows, + Index_ nnz, + Index_ batchSize, + const bool* adj, + Index_* row_ind_ptr, + cudaStream_t stream, + Lambda fused_op) +{ op::csr_row_op( - row_ind, batchSize, nnz, + row_ind, + batchSize, + nnz, [fused_op, adj, total_rows, row_ind_ptr, batchSize, nnz] __device__( Index_ row, Index_ start_idx, Index_ stop_idx) { fused_op(row, start_idx, stop_idx); @@ -108,14 +118,23 @@ void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, stream); } -template void> -void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, - Index_ batchSize, const bool *adj, - Index_ *row_ind_ptr, cudaStream_t stream) { - csr_adj_graph_batched( - row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream, - [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {}); +template void> +void csr_adj_graph_batched(const Index_* row_ind, + Index_ total_rows, + Index_ nnz, + Index_ batchSize, + const bool* adj, + Index_* row_ind_ptr, + cudaStream_t stream) +{ + csr_adj_graph_batched(row_ind, + total_rows, + nnz, + batchSize, + adj, + row_ind_ptr, + stream, + [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {}); } /** @@ -131,13 +150,17 @@ void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, * @param stream cuda stream to use * @param fused_op the fused operation */ -template void> -void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz, - const bool *adj, Index_ *row_ind_ptr, cudaStream_t stream, - Lambda fused_op) { - csr_adj_graph_batched(row_ind, total_rows, nnz, total_rows, - adj, row_ind_ptr, stream, fused_op); +template void> +void csr_adj_graph(const Index_* row_ind, + Index_ total_rows, + Index_ nnz, + const bool* adj, + Index_* row_ind_ptr, + cudaStream_t stream, + Lambda fused_op) +{ + csr_adj_graph_batched( + row_ind, total_rows, nnz, total_rows, adj, row_ind_ptr, stream, fused_op); } /** @@ -151,9 +174,13 @@ void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz, * @param stream: cuda stream to use */ template -void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, +void sorted_coo_to_csr(const T* rows, + int nnz, + T* row_ind, + int m, std::shared_ptr d_alloc, - cudaStream_t stream) { + cudaStream_t stream) +{ raft::mr::device::buffer row_counts(d_alloc, stream, m); CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream)); @@ -161,11 +188,9 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, linalg::coo_degree<32>(rows, nnz, row_counts.data(), stream); // create csr compressed row index from row counts - thrust::device_ptr row_counts_d = - thrust::device_pointer_cast(row_counts.data()); - thrust::device_ptr c_ind_d = thrust::device_pointer_cast(row_ind); - exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, - c_ind_d); + thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); + thrust::device_ptr c_ind_d = thrust::device_pointer_cast(row_ind); + exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, c_ind_d); } /** @@ -177,11 +202,12 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, * @param stream: cuda stream to use */ template -void sorted_coo_to_csr(COO *coo, int *row_ind, +void sorted_coo_to_csr(COO* coo, + int* row_ind, std::shared_ptr d_alloc, - cudaStream_t stream) { - sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc, - stream); + cudaStream_t stream) +{ + sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc, stream); } }; // end NAMESPACE convert diff --git a/cpp/include/raft/sparse/convert/dense.cuh b/cpp/include/raft/sparse/convert/dense.cuh index 299f9d36d4..e90882b501 100644 --- a/cpp/include/raft/sparse/convert/dense.cuh +++ b/cpp/include/raft/sparse/convert/dense.cuh @@ -37,22 +37,20 @@ namespace sparse { namespace convert { template -__global__ void csr_to_dense_warp_per_row_kernel(int n_cols, - const value_t *csrVal, - const int *csrRowPtr, - const int *csrColInd, - value_t *a) { +__global__ void csr_to_dense_warp_per_row_kernel( + int n_cols, const value_t* csrVal, const int* csrRowPtr, const int* csrColInd, value_t* a) +{ int row = blockIdx.x; int tid = threadIdx.x; int colStart = csrRowPtr[row]; - int colEnd = csrRowPtr[row + 1]; - int rowNnz = colEnd - colStart; + int colEnd = csrRowPtr[row + 1]; + int rowNnz = colEnd - colStart; for (int i = tid; i < rowNnz; i += blockDim.x) { int colIdx = colStart + i; if (colIdx < colEnd) { - int col = csrColInd[colIdx]; + int col = csrColInd[colIdx]; a[row * n_cols + col] = csrVal[colIdx]; } } @@ -77,10 +75,17 @@ __global__ void csr_to_dense_warp_per_row_kernel(int n_cols, * @param[in] row_major : Is row-major output desired? */ template -void csr_to_dense(cusparseHandle_t handle, value_idx nrows, value_idx ncols, - const value_idx *csr_indptr, const value_idx *csr_indices, - const value_t *csr_data, value_idx lda, value_t *out, - cudaStream_t stream, bool row_major = true) { +void csr_to_dense(cusparseHandle_t handle, + value_idx nrows, + value_idx ncols, + const value_idx* csr_indptr, + const value_idx* csr_indices, + const value_t* csr_data, + value_idx lda, + value_t* out, + cudaStream_t stream, + bool row_major = true) +{ if (!row_major) { /** * If we need col-major, use cusparse. @@ -91,15 +96,13 @@ void csr_to_dense(cusparseHandle_t handle, value_idx nrows, value_idx ncols, CUSPARSE_CHECK(cusparseSetMatType(out_mat, CUSPARSE_MATRIX_TYPE_GENERAL)); CUSPARSE_CHECK(raft::sparse::cusparsecsr2dense( - handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out, - lda, stream)); + handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out, lda, stream)); CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(out_mat)); } else { int blockdim = block_dim(ncols); - CUDA_CHECK( - cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream)); + CUDA_CHECK(cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream)); csr_to_dense_warp_per_row_kernel<<>>( ncols, csr_data, csr_indptr, csr_indices, out); } diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh index 73120fea8c..348ed5eab2 100644 --- a/cpp/include/raft/sparse/coo.cuh +++ b/cpp/include/raft/sparse/coo.cuh @@ -68,83 +68,87 @@ class COO { Index_Type n_cols; /** - * @param d_alloc: the device allocator to use for the underlying buffers - * @param stream: CUDA stream to use - */ + * @param d_alloc: the device allocator to use for the underlying buffers + * @param stream: CUDA stream to use + */ COO(std::shared_ptr d_alloc, cudaStream_t stream) : rows_arr(d_alloc, stream, 0), cols_arr(d_alloc, stream, 0), vals_arr(d_alloc, stream, 0), nnz(0), n_rows(0), - n_cols(0) {} + n_cols(0) + { + } /** - * @param rows: coo rows array - * @param cols: coo cols array - * @param vals: coo vals array - * @param nnz: size of the rows/cols/vals arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of cols in the dense matrix - */ - COO(raft::mr::device::buffer &rows, - raft::mr::device::buffer &cols, - raft::mr::device::buffer &vals, Index_Type nnz, Index_Type n_rows = 0, + * @param rows: coo rows array + * @param cols: coo cols array + * @param vals: coo vals array + * @param nnz: size of the rows/cols/vals arrays + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of cols in the dense matrix + */ + COO(raft::mr::device::buffer& rows, + raft::mr::device::buffer& cols, + raft::mr::device::buffer& vals, + Index_Type nnz, + Index_Type n_rows = 0, Index_Type n_cols = 0) - : rows_arr(rows), - cols_arr(cols), - vals_arr(vals), - nnz(nnz), - n_rows(n_rows), - n_cols(n_cols) {} + : rows_arr(rows), cols_arr(cols), vals_arr(vals), nnz(nnz), n_rows(n_rows), n_cols(n_cols) + { + } /** - * @param d_alloc: the device allocator use - * @param stream: CUDA stream to use - * @param nnz: size of the rows/cols/vals arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of cols in the dense matrix - * @param init: initialize arrays with zeros - */ - COO(std::shared_ptr d_alloc, cudaStream_t stream, - Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0, - bool init = true) + * @param d_alloc: the device allocator use + * @param stream: CUDA stream to use + * @param nnz: size of the rows/cols/vals arrays + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of cols in the dense matrix + * @param init: initialize arrays with zeros + */ + COO(std::shared_ptr d_alloc, + cudaStream_t stream, + Index_Type nnz, + Index_Type n_rows = 0, + Index_Type n_cols = 0, + bool init = true) : rows_arr(d_alloc, stream, nnz), cols_arr(d_alloc, stream, nnz), vals_arr(d_alloc, stream, nnz), nnz(nnz), n_rows(n_rows), - n_cols(n_cols) { + n_cols(n_cols) + { if (init) init_arrays(stream); } - void init_arrays(cudaStream_t stream) { - CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0, - this->nnz * sizeof(Index_Type), stream)); - CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0, - this->nnz * sizeof(Index_Type), stream)); - CUDA_CHECK( - cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream)); + void init_arrays(cudaStream_t stream) + { + CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0, this->nnz * sizeof(Index_Type), stream)); + CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0, this->nnz * sizeof(Index_Type), stream)); + CUDA_CHECK(cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream)); } ~COO() {} /** - * @brief Size should be > 0, with the number of rows - * and cols in the dense matrix being > 0. - */ - bool validate_size() const { + * @brief Size should be > 0, with the number of rows + * and cols in the dense matrix being > 0. + */ + bool validate_size() const + { if (this->nnz < 0 || n_rows < 0 || n_cols < 0) return false; return true; } /** - * @brief If the underlying arrays have not been set, - * return false. Otherwise true. - */ - bool validate_mem() const { - if (this->rows_arr.size() == 0 || this->cols_arr.size() == 0 || - this->vals_arr.size() == 0) { + * @brief If the underlying arrays have not been set, + * return false. Otherwise true. + */ + bool validate_mem() const + { + if (this->rows_arr.size() == 0 || this->cols_arr.size() == 0 || this->vals_arr.size() == 0) { return false; } @@ -154,33 +158,30 @@ class COO { /* * @brief Returns the rows array */ - Index_Type *rows() { return this->rows_arr.data(); } + Index_Type* rows() { return this->rows_arr.data(); } /** * @brief Returns the cols array */ - Index_Type *cols() { return this->cols_arr.data(); } + Index_Type* cols() { return this->cols_arr.data(); } /** * @brief Returns the vals array */ - T *vals() { return this->vals_arr.data(); } + T* vals() { return this->vals_arr.data(); } /** - * @brief Send human-readable state information to output stream - */ - friend std::ostream &operator<<(std::ostream &out, - const COO &c) { + * @brief Send human-readable state information to output stream + */ + friend std::ostream& operator<<(std::ostream& out, const COO& c) + { if (c.validate_size() && c.validate_mem()) { cudaStream_t stream; CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream) - << std::endl; - out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream) - << std::endl; - out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream) - << std::endl; + out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream) << std::endl; + out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream) << std::endl; + out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream) << std::endl; out << "nnz=" << c.nnz << std::endl; out << "n_rows=" << c.n_rows << std::endl; out << "n_cols=" << c.n_cols << std::endl; @@ -194,58 +195,59 @@ class COO { } /** - * @brief Set the number of rows and cols - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of columns in the dense matrix - */ - void setSize(int n_rows, int n_cols) { + * @brief Set the number of rows and cols + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of columns in the dense matrix + */ + void setSize(int n_rows, int n_cols) + { this->n_rows = n_rows; this->n_cols = n_cols; } /** - * @brief Set the number of rows and cols for a square dense matrix - * @param n: number of rows and cols - */ - void setSize(int n) { + * @brief Set the number of rows and cols for a square dense matrix + * @param n: number of rows and cols + */ + void setSize(int n) + { this->n_rows = n; this->n_cols = n; } /** - * @brief Allocate the underlying arrays - * @param nnz: size of underlying row/col/val arrays - * @param init: should values be initialized to 0? - * @param stream: CUDA stream to use - */ - void allocate(int nnz, bool init, cudaStream_t stream) { - this->allocate(nnz, 0, init, stream); - } + * @brief Allocate the underlying arrays + * @param nnz: size of underlying row/col/val arrays + * @param init: should values be initialized to 0? + * @param stream: CUDA stream to use + */ + void allocate(int nnz, bool init, cudaStream_t stream) { this->allocate(nnz, 0, init, stream); } /** - * @brief Allocate the underlying arrays - * @param nnz: size of the underlying row/col/val arrays - * @param size: the number of rows/cols in a square dense matrix - * @param init: should values be initialized to 0? - * @param stream: CUDA stream to use - */ - void allocate(int nnz, int size, bool init, cudaStream_t stream) { + * @brief Allocate the underlying arrays + * @param nnz: size of the underlying row/col/val arrays + * @param size: the number of rows/cols in a square dense matrix + * @param init: should values be initialized to 0? + * @param stream: CUDA stream to use + */ + void allocate(int nnz, int size, bool init, cudaStream_t stream) + { this->allocate(nnz, size, size, init, stream); } /** - * @brief Allocate the underlying arrays - * @param nnz: size of the underlying row/col/val arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of columns in the dense matrix - * @param init: should values be initialized to 0? - * @param stream: stream to use for init - */ - void allocate(int nnz, int n_rows, int n_cols, bool init, - cudaStream_t stream) { + * @brief Allocate the underlying arrays + * @param nnz: size of the underlying row/col/val arrays + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of columns in the dense matrix + * @param init: should values be initialized to 0? + * @param stream: stream to use for init + */ + void allocate(int nnz, int n_rows, int n_cols, bool init, cudaStream_t stream) + { this->n_rows = n_rows; this->n_cols = n_cols; - this->nnz = nnz; + this->nnz = nnz; this->rows_arr.resize(this->nnz, stream); this->cols_arr.resize(this->nnz, stream); diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh index bc4a68d296..17f3c735af 100644 --- a/cpp/include/raft/sparse/csr.cuh +++ b/cpp/include/raft/sparse/csr.cuh @@ -41,57 +41,64 @@ namespace sparse { struct WeakCCState { public: - bool *m; - WeakCCState(bool *m) : m(m) {} + bool* m; + WeakCCState(bool* m) : m(m) {} }; template -__global__ void weak_cc_label_device(Index_ *__restrict__ labels, - const Index_ *__restrict__ row_ind, - const Index_ *__restrict__ row_ind_ptr, - Index_ nnz, bool *__restrict__ m, - Index_ start_vertex_id, Index_ batch_size, - Index_ N, Lambda filter_op) { - Index_ tid = threadIdx.x + blockIdx.x * TPB_X; +__global__ void weak_cc_label_device(Index_* __restrict__ labels, + const Index_* __restrict__ row_ind, + const Index_* __restrict__ row_ind_ptr, + Index_ nnz, + bool* __restrict__ m, + Index_ start_vertex_id, + Index_ batch_size, + Index_ N, + Lambda filter_op) +{ + Index_ tid = threadIdx.x + blockIdx.x * TPB_X; Index_ global_id = tid + start_vertex_id; if (tid < batch_size && global_id < N) { Index_ start = __ldg(row_ind + tid); Index_ ci, cj; - bool ci_mod = false; - ci = labels[global_id]; + bool ci_mod = false; + ci = labels[global_id]; bool ci_allow_prop = filter_op(global_id); Index_ end = get_stop_idx(tid, batch_size, nnz, row_ind); /// TODO: add one element to row_ind and avoid get_stop_idx for (Index_ j = start; j < end; j++) { - Index_ j_ind = __ldg(row_ind_ptr + j); - cj = labels[j_ind]; + Index_ j_ind = __ldg(row_ind_ptr + j); + cj = labels[j_ind]; bool cj_allow_prop = filter_op(j_ind); if (ci < cj && ci_allow_prop) { if (sizeof(Index_) == 4) - atomicMin((int *)(labels + j_ind), ci); + atomicMin((int*)(labels + j_ind), ci); else if (sizeof(Index_) == 8) - atomicMin((long long int *)(labels + j_ind), ci); + atomicMin((long long int*)(labels + j_ind), ci); if (cj_allow_prop) *m = true; } else if (ci > cj && cj_allow_prop) { - ci = cj; + ci = cj; ci_mod = true; } } if (ci_mod) { if (sizeof(Index_) == 4) - atomicMin((int *)(labels + global_id), ci); + atomicMin((int*)(labels + global_id), ci); else if (sizeof(Index_) == 8) - atomicMin((long long int *)(labels + global_id), ci); + atomicMin((long long int*)(labels + global_id), ci); if (ci_allow_prop) *m = true; } } } template -__global__ void weak_cc_init_all_kernel(Index_ *labels, Index_ N, - Index_ MAX_LABEL, Lambda filter_op) { +__global__ void weak_cc_init_all_kernel(Index_* labels, + Index_ N, + Index_ MAX_LABEL, + Lambda filter_op) +{ Index_ tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { if (filter_op(tid)) @@ -123,22 +130,25 @@ __global__ void weak_cc_init_all_kernel(Index_ *labels, Index_ N, * @param filter_op an optional filtering function to determine which points * should get considered for labeling. It gets global indexes (not batch-wide!) */ -template bool> -void weak_cc_batched(Index_ *labels, const Index_ *row_ind, - const Index_ *row_ind_ptr, Index_ nnz, Index_ N, - Index_ start_vertex_id, Index_ batch_size, - WeakCCState *state, cudaStream_t stream, - Lambda filter_op) { - ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8, - "Index_ should be 4 or 8 bytes"); +template bool> +void weak_cc_batched(Index_* labels, + const Index_* row_ind, + const Index_* row_ind_ptr, + Index_ nnz, + Index_ N, + Index_ start_vertex_id, + Index_ batch_size, + WeakCCState* state, + cudaStream_t stream, + Lambda filter_op) +{ + ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8, "Index_ should be 4 or 8 bytes"); bool host_m; Index_ MAX_LABEL = std::numeric_limits::max(); weak_cc_init_all_kernel - <<>>( - labels, N, MAX_LABEL, filter_op); + <<>>(labels, N, MAX_LABEL, filter_op); CUDA_CHECK(cudaPeekAtLastError()); int n_iters = 0; @@ -147,8 +157,7 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, weak_cc_label_device <<>>( - labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id, - batch_size, N, filter_op); + labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id, batch_size, N, filter_op); CUDA_CHECK(cudaPeekAtLastError()); //** Updating m * @@ -180,12 +189,25 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, * @param stream the cuda stream to use */ template -void weak_cc_batched(Index_ *labels, const Index_ *row_ind, - const Index_ *row_ind_ptr, Index_ nnz, Index_ N, - Index_ start_vertex_id, Index_ batch_size, - WeakCCState *state, cudaStream_t stream) { - weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, start_vertex_id, - batch_size, state, stream, +void weak_cc_batched(Index_* labels, + const Index_* row_ind, + const Index_* row_ind_ptr, + Index_ nnz, + Index_ N, + Index_ start_vertex_id, + Index_ batch_size, + WeakCCState* state, + cudaStream_t stream) +{ + weak_cc_batched(labels, + row_ind, + row_ind_ptr, + nnz, + N, + start_vertex_id, + batch_size, + state, + stream, [] __device__(Index_ tid) { return true; }); } @@ -213,17 +235,20 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, * @param filter_op an optional filtering function to determine which points * should get considered for labeling. It gets global indexes (not batch-wide!) */ -template bool> -void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, - Index_ nnz, Index_ N, +template bool> +void weak_cc(Index_* labels, + const Index_* row_ind, + const Index_* row_ind_ptr, + Index_ nnz, + Index_ N, std::shared_ptr d_alloc, - cudaStream_t stream, Lambda filter_op) { + cudaStream_t stream, + Lambda filter_op) +{ raft::mr::device::buffer m(d_alloc, stream, 1); WeakCCState state(m.data()); - weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, - stream, filter_op); + weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, filter_op); } /** @@ -249,14 +274,18 @@ void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, * @param stream the cuda stream to use */ template -void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, - Index_ nnz, Index_ N, +void weak_cc(Index_* labels, + const Index_* row_ind, + const Index_* row_ind_ptr, + Index_ nnz, + Index_ N, std::shared_ptr d_alloc, - cudaStream_t stream) { + cudaStream_t stream) +{ raft::mr::device::buffer m(d_alloc, stream, 1); WeakCCState state(m.data()); - weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, - stream, [](Index_) { return true; }); + weak_cc_batched( + labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, [](Index_) { return true; }); } }; // namespace sparse diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 360832f557..9d42ec34cb 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -23,10 +23,9 @@ //#include #define _CUSPARSE_ERR_TO_STR(err) \ - case err: \ - return #err; + case err: return #err; -//Notes: +// Notes: //(1.) CUDA_VER_10_1_UP aggregates all the CUDA version selection logic; //(2.) to enforce a lower version, // @@ -43,16 +42,15 @@ namespace raft { * @brief Exception thrown when a cuSparse error is encountered. */ struct cusparse_error : public raft::exception { - explicit cusparse_error(char const* const message) - : raft::exception(message) {} - explicit cusparse_error(std::string const& message) - : raft::exception(message) {} + explicit cusparse_error(char const* const message) : raft::exception(message) {} + explicit cusparse_error(std::string const& message) : raft::exception(message) {} }; namespace sparse { namespace detail { -inline const char* cusparse_error_to_string(cusparseStatus_t err) { +inline const char* cusparse_error_to_string(cusparseStatus_t err) +{ #if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 return cusparseGetErrorString(err); #else // CUDART_VERSION @@ -65,8 +63,7 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) { _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED); _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR); _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - default: - return "CUSPARSE_STATUS_UNKNOWN"; + default: return "CUSPARSE_STATUS_UNKNOWN"; }; #endif // CUDART_VERSION } @@ -88,8 +85,11 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) { cusparseStatus_t const status = (call); \ if (CUSPARSE_STATUS_SUCCESS != status) { \ std::string msg{}; \ - SET_ERROR_MSG(msg, "cuSparse error encountered at: ", \ - "call='%s', Reason=%d:%s", #call, status, \ + SET_ERROR_MSG(msg, \ + "cuSparse error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ raft::sparse::detail::cusparse_error_to_string(status)); \ throw raft::cusparse_error(msg); \ } \ @@ -100,13 +100,15 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) { //@todo: use logger here once logging is enabled /** check for cusparse runtime API errors but do not assert */ -#define CUSPARSE_CHECK_NO_THROW(call) \ - do { \ - cusparseStatus_t err = call; \ - if (err != CUSPARSE_STATUS_SUCCESS) { \ - printf("CUSPARSE call='%s' got errorcode=%d err=%s", #call, err, \ - raft::sparse::detail::cusparse_error_to_string(err)); \ - } \ +#define CUSPARSE_CHECK_NO_THROW(call) \ + do { \ + cusparseStatus_t err = call; \ + if (err != CUSPARSE_STATUS_SUCCESS) { \ + printf("CUSPARSE call='%s' got errorcode=%d err=%s", \ + #call, \ + err, \ + raft::sparse::detail::cusparse_error_to_string(err)); \ + } \ } while (0) namespace raft { @@ -117,28 +119,34 @@ namespace sparse { * @{ */ template -cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, const T* vals, - T* vals_sorted, int* d_P, cudaStream_t stream); +cusparseStatus_t cusparsegthr( + cusparseHandle_t handle, int nnz, const T* vals, T* vals_sorted, int* d_P, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, - const double* vals, double* vals_sorted, - int* d_P, cudaStream_t stream) { +inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, + int nnz, + const double* vals, + double* vals_sorted, + int* d_P, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDgthr(handle, nnz, vals, vals_sorted, d_P, - CUSPARSE_INDEX_BASE_ZERO); + return cusparseDgthr(handle, nnz, vals, vals_sorted, d_P, CUSPARSE_INDEX_BASE_ZERO); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, - const float* vals, float* vals_sorted, - int* d_P, cudaStream_t stream) { +inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, + int nnz, + const float* vals, + float* vals_sorted, + int* d_P, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseSgthr(handle, nnz, vals, vals_sorted, d_P, - CUSPARSE_INDEX_BASE_ZERO); + return cusparseSgthr(handle, nnz, vals, vals_sorted, d_P, CUSPARSE_INDEX_BASE_ZERO); #pragma GCC diagnostic pop } /** @} */ @@ -148,15 +156,18 @@ inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, * @{ */ template -void cusparsecoo2csr(cusparseHandle_t handle, const T* cooRowInd, int nnz, - int m, T* csrRowPtr, cudaStream_t stream); +void cusparsecoo2csr( + cusparseHandle_t handle, const T* cooRowInd, int nnz, int m, T* csrRowPtr, cudaStream_t stream); template <> -inline void cusparsecoo2csr(cusparseHandle_t handle, const int* cooRowInd, - int nnz, int m, int* csrRowPtr, - cudaStream_t stream) { +inline void cusparsecoo2csr(cusparseHandle_t handle, + const int* cooRowInd, + int nnz, + int m, + int* csrRowPtr, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK(cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr, - CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CHECK(cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr, CUSPARSE_INDEX_BASE_ZERO)); } /** @} */ @@ -166,30 +177,54 @@ inline void cusparsecoo2csr(cusparseHandle_t handle, const int* cooRowInd, */ template size_t cusparsecoosort_bufferSizeExt( // NOLINT - cusparseHandle_t handle, int m, int n, int nnz, const T* cooRows, - const T* cooCols, cudaStream_t stream); + cusparseHandle_t handle, + int m, + int n, + int nnz, + const T* cooRows, + const T* cooCols, + cudaStream_t stream); template <> inline size_t cusparsecoosort_bufferSizeExt( // NOLINT - cusparseHandle_t handle, int m, int n, int nnz, const int* cooRows, - const int* cooCols, cudaStream_t stream) { + cusparseHandle_t handle, + int m, + int n, + int nnz, + const int* cooRows, + const int* cooCols, + cudaStream_t stream) +{ size_t val; CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK( - cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols, &val)); + CUSPARSE_CHECK(cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols, &val)); return val; } template void cusparsecoosortByRow( // NOLINT - cusparseHandle_t handle, int m, int n, int nnz, T* cooRows, T* cooCols, T* P, - void* pBuffer, cudaStream_t stream); + cusparseHandle_t handle, + int m, + int n, + int nnz, + T* cooRows, + T* cooCols, + T* P, + void* pBuffer, + cudaStream_t stream); template <> inline void cusparsecoosortByRow( // NOLINT - cusparseHandle_t handle, int m, int n, int nnz, int* cooRows, int* cooCols, - int* P, void* pBuffer, cudaStream_t stream) { + cusparseHandle_t handle, + int m, + int n, + int nnz, + int* cooRows, + int* cooCols, + int* P, + void* pBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK( - cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer)); + CUSPARSE_CHECK(cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer)); } /** @} */ @@ -199,37 +234,67 @@ inline void cusparsecoosortByRow( // NOLINT */ template cusparseStatus_t cusparsegemmi( // NOLINT - cusparseHandle_t handle, int m, int n, int k, int nnz, const T* alpha, - const T* A, int lda, const T* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const T* beta, T* C, int ldc, cudaStream_t stream); + cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const T* alpha, + const T* A, + int lda, + const T* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, + const T* beta, + T* C, + int ldc, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, - int k, int nnz, const float* alpha, - const float* A, int lda, +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const float* alpha, + const float* A, + int lda, const float* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const float* beta, - float* C, int ldc, cudaStream_t stream) { + const int* cscRowIndB, + const float* beta, + float* C, + int ldc, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseSgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, - cscColPtrB, cscRowIndB, beta, C, ldc); + return cusparseSgemmi( + handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, - int k, int nnz, const double* alpha, - const double* A, int lda, +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const double* alpha, + const double* A, + int lda, const double* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const double* beta, - double* C, int ldc, cudaStream_t stream) { + const int* cscRowIndB, + const double* beta, + double* C, + int ldc, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, - cscColPtrB, cscRowIndB, beta, C, ldc); + return cusparseDgemmi( + handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); #pragma GCC diagnostic pop } /** @} */ @@ -241,49 +306,94 @@ inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, */ template cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, int64_t nnz, - IndexT* csrRowOffsets, IndexT* csrColInd, + int64_t rows, + int64_t cols, + int64_t nnz, + IndexT* csrRowOffsets, + IndexT* csrColInd, ValueT* csrValues); template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, - int64_t nnz, int* csrRowOffsets, - int* csrColInd, float* csrValues) { - return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, - csrColInd, csrValues, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + int64_t rows, + int64_t cols, + int64_t nnz, + int* csrRowOffsets, + int* csrColInd, + float* csrValues) +{ + return cusparseCreateCsr(spMatDescr, + rows, + cols, + nnz, + csrRowOffsets, + csrColInd, + csrValues, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); } template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, - int64_t nnz, int* csrRowOffsets, - int* csrColInd, double* csrValues) { - return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, - csrColInd, csrValues, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + int64_t rows, + int64_t cols, + int64_t nnz, + int* csrRowOffsets, + int* csrColInd, + double* csrValues) +{ + return cusparseCreateCsr(spMatDescr, + rows, + cols, + nnz, + csrRowOffsets, + csrColInd, + csrValues, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F); } template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, - int64_t nnz, int64_t* csrRowOffsets, + int64_t rows, + int64_t cols, + int64_t nnz, + int64_t* csrRowOffsets, int64_t* csrColInd, - float* csrValues) { - return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, - csrColInd, csrValues, CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, + float* csrValues) +{ + return cusparseCreateCsr(spMatDescr, + rows, + cols, + nnz, + csrRowOffsets, + csrColInd, + csrValues, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); } template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, - int64_t nnz, int64_t* csrRowOffsets, + int64_t rows, + int64_t cols, + int64_t nnz, + int64_t* csrRowOffsets, int64_t* csrColInd, - double* csrValues) { - return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, - csrColInd, csrValues, CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, + double* csrValues) +{ + return cusparseCreateCsr(spMatDescr, + rows, + cols, + nnz, + csrRowOffsets, + csrColInd, + csrValues, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F); } /** @} */ @@ -292,16 +402,19 @@ inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, * @{ */ template -cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, - int64_t size, T* values); +cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, int64_t size, T* values); template <> inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, - int64_t size, float* values) { + int64_t size, + float* values) +{ return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_32F); } template <> inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, - int64_t size, double* values) { + int64_t size, + double* values) +{ return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_64F); } /** @} */ @@ -312,23 +425,30 @@ inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, */ template cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, - int64_t rows, int64_t cols, int64_t ld, - T* values, cusparseOrder_t order); + int64_t rows, + int64_t cols, + int64_t ld, + T* values, + cusparseOrder_t order); template <> inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, - int64_t rows, int64_t cols, - int64_t ld, float* values, - cusparseOrder_t order) { - return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_32F, - order); + int64_t rows, + int64_t cols, + int64_t ld, + float* values, + cusparseOrder_t order) +{ + return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_32F, order); } template <> inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, - int64_t rows, int64_t cols, - int64_t ld, double* values, - cusparseOrder_t order) { - return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_64F, - order); + int64_t rows, + int64_t cols, + int64_t ld, + double* values, + cusparseOrder_t order) +{ + return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_64F, order); } /** @} */ @@ -337,58 +457,89 @@ inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, * @{ */ template -cusparseStatus_t cusparsespmv_buffersize( - cusparseHandle_t handle, cusparseOperation_t opA, const T* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const T* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - size_t* bufferSize, cudaStream_t stream); +cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle, + cusparseOperation_t opA, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const T* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + size_t* bufferSize, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmv_buffersize( - cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle, + cusparseOperation_t opA, + const float* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const float* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, - CUDA_R_32F, alg, bufferSize); + return cusparseSpMV_bufferSize( + handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, alg, bufferSize); } template <> -inline cusparseStatus_t cusparsespmv_buffersize( - cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle, + cusparseOperation_t opA, + const double* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const double* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, - CUDA_R_64F, alg, bufferSize); + return cusparseSpMV_bufferSize( + handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, bufferSize); } template -cusparseStatus_t cusparsespmv(cusparseHandle_t handle, cusparseOperation_t opA, - const T* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnVecDescr_t vecX, const T* beta, +cusparseStatus_t cusparsespmv(cusparseHandle_t handle, + cusparseOperation_t opA, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const T* beta, const cusparseDnVecDescr_t vecY, - cusparseSpMVAlg_t alg, T* externalBuffer, + cusparseSpMVAlg_t alg, + T* externalBuffer, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmv( - cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - float* externalBuffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle, + cusparseOperation_t opA, + const float* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const float* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + float* externalBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, - alg, externalBuffer); + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, alg, externalBuffer); } template <> -inline cusparseStatus_t cusparsespmv( - cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - double* externalBuffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle, + cusparseOperation_t opA, + const double* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const double* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + double* externalBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, - alg, externalBuffer); + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, externalBuffer); } /** @} */ #else @@ -398,29 +549,59 @@ inline cusparseStatus_t cusparsespmv( */ template cusparseStatus_t cusparsecsrmv( // NOLINT - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, - const T* alpha, const cusparseMatDescr_t descr, const T* csrVal, - const int* csrRowPtr, const int* csrColInd, const T* x, const T* beta, T* y, + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const T* alpha, + const cusparseMatDescr_t descr, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const T* x, + const T* beta, + T* y, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsrmv( - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, - const float* alpha, const cusparseMatDescr_t descr, const float* csrVal, - const int* csrRowPtr, const int* csrColInd, const float* x, const float* beta, - float* y, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const float* alpha, + const cusparseMatDescr_t descr, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const float* x, + const float* beta, + float* y, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, - csrRowPtr, csrColInd, x, beta, y); + return cusparseScsrmv( + handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); } template <> -inline cusparseStatus_t cusparsecsrmv( - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, - const double* alpha, const cusparseMatDescr_t descr, const double* csrVal, - const int* csrRowPtr, const int* csrColInd, const double* x, - const double* beta, double* y, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const double* alpha, + const cusparseMatDescr_t descr, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const double* x, + const double* beta, + double* y, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, - csrRowPtr, csrColInd, x, beta, y); + return cusparseDcsrmv( + handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); } /** @} */ #endif @@ -431,58 +612,96 @@ inline cusparseStatus_t cusparsecsrmv( * @{ */ template -cusparseStatus_t cusparsespmm_bufferSize( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const T* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream); +cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const T* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + size_t* bufferSize, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmm_bufferSize( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const float* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const float* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const float* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, - matC, CUDA_R_32F, alg, bufferSize); + return cusparseSpMM_bufferSize( + handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_32F, alg, bufferSize); } template <> -inline cusparseStatus_t cusparsespmm_bufferSize( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const double* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const double* beta, - cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, size_t* bufferSize, - cudaStream_t stream) { +inline cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const double* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const double* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, - matC, CUDA_R_64F, alg, bufferSize); + return cusparseSpMM_bufferSize( + handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_64F, alg, bufferSize); } template -inline cusparseStatus_t cusparsespmm( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const T* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, T* externalBuffer, cudaStream_t stream); +inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const T* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + T* externalBuffer, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmm( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const float* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, float* externalBuffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const float* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const float* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + float* externalBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, - CUDA_R_32F, alg, externalBuffer); + return cusparseSpMM( + handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_32F, alg, externalBuffer); } template <> -inline cusparseStatus_t cusparsespmm( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const double* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const double* beta, - cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, double* externalBuffer, - cudaStream_t stream) { +inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const double* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const double* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + double* externalBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, - CUDA_R_64F, alg, externalBuffer); + return cusparseSpMM( + handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_64F, alg, externalBuffer); } /** @} */ #else @@ -492,31 +711,68 @@ inline cusparseStatus_t cusparsespmm( */ template cusparseStatus_t cusparsecsrmm( // NOLINT - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, - int nnz, const T* alpha, const cusparseMatDescr_t descr, const T* csrVal, - const int* csrRowPtr, const int* csrColInd, const T* x, const int ldx, - const T* beta, T* y, const int ldy, cudaStream_t stream); + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int k, + int nnz, + const T* alpha, + const cusparseMatDescr_t descr, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const T* x, + const int ldx, + const T* beta, + T* y, + const int ldy, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsrmm( - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, - int nnz, const float* alpha, const cusparseMatDescr_t descr, - const float* csrVal, const int* csrRowPtr, const int* csrColInd, - const float* x, const int ldx, const float* beta, float* y, const int ldy, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int k, + int nnz, + const float* alpha, + const cusparseMatDescr_t descr, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const float* x, + const int ldx, + const float* beta, + float* y, + const int ldy, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, - csrRowPtr, csrColInd, x, ldx, beta, y, ldy); + return cusparseScsrmm( + handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); } template <> -inline cusparseStatus_t cusparsecsrmm( - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, - int nnz, const double* alpha, const cusparseMatDescr_t descr, - const double* csrVal, const int* csrRowPtr, const int* csrColInd, - const double* x, const int ldx, const double* beta, double* y, const int ldy, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int k, + int nnz, + const double* alpha, + const cusparseMatDescr_t descr, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const double* x, + const int ldx, + const double* beta, + double* y, + const int ldy, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, - csrRowPtr, csrColInd, x, ldx, beta, y, ldy); + return cusparseDcsrmm( + handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); } /** @} */ #endif @@ -527,15 +783,22 @@ inline cusparseStatus_t cusparsecsrmm( */ template void cusparsecsr2coo( // NOLINT - cusparseHandle_t handle, const int n, const int nnz, const T* csrRowPtr, - T* cooRowInd, cudaStream_t stream); + cusparseHandle_t handle, + const int n, + const int nnz, + const T* csrRowPtr, + T* cooRowInd, + cudaStream_t stream); template <> -inline void cusparsecsr2coo(cusparseHandle_t handle, const int n, const int nnz, - const int* csrRowPtr, int* cooRowInd, - cudaStream_t stream) { +inline void cusparsecsr2coo(cusparseHandle_t handle, + const int n, + const int nnz, + const int* csrRowPtr, + int* cooRowInd, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, - CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, CUSPARSE_INDEX_BASE_ZERO)); } /** @} */ @@ -553,7 +816,8 @@ inline void cusparsecsr2coo(cusparseHandle_t handle, const int n, const int nnz, // template<> inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle, cusparsePointerMode_t mode, - cudaStream_t stream) { + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSetPointerMode(handle, mode); } @@ -564,69 +828,203 @@ inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle, * @{ */ template -cusparseStatus_t cusparsecsrmvex_bufferSize( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA, - const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x, - const T* beta, T* y, size_t* bufferSizeInBytes, cudaStream_t stream); -template <> -inline cusparseStatus_t cusparsecsrmvex_bufferSize( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA, - const float* csrValA, const int* csrRowPtrA, const int* csrColIndA, - const float* x, const float* beta, float* y, size_t* bufferSizeInBytes, - cudaStream_t stream) { - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx_bufferSize( - handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F, descrA, csrValA, - CUDA_R_32F, csrRowPtrA, csrColIndA, x, CUDA_R_32F, beta, CUDA_R_32F, y, - CUDA_R_32F, CUDA_R_32F, bufferSizeInBytes); -} -template <> -inline cusparseStatus_t cusparsecsrmvex_bufferSize( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA, - const double* csrValA, const int* csrRowPtrA, const int* csrColIndA, - const double* x, const double* beta, double* y, size_t* bufferSizeInBytes, - cudaStream_t stream) { - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx_bufferSize( - handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F, descrA, csrValA, - CUDA_R_64F, csrRowPtrA, csrColIndA, x, CUDA_R_64F, beta, CUDA_R_64F, y, - CUDA_R_64F, CUDA_R_64F, bufferSizeInBytes); +cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const T* alpha, + const cusparseMatDescr_t descrA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const T* x, + const T* beta, + T* y, + size_t* bufferSizeInBytes, + cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const float* x, + const float* beta, + float* y, + size_t* bufferSizeInBytes, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx_bufferSize(handle, + alg, + transA, + m, + n, + nnz, + alpha, + CUDA_R_32F, + descrA, + csrValA, + CUDA_R_32F, + csrRowPtrA, + csrColIndA, + x, + CUDA_R_32F, + beta, + CUDA_R_32F, + y, + CUDA_R_32F, + CUDA_R_32F, + bufferSizeInBytes); +} +template <> +inline cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const double* x, + const double* beta, + double* y, + size_t* bufferSizeInBytes, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx_bufferSize(handle, + alg, + transA, + m, + n, + nnz, + alpha, + CUDA_R_64F, + descrA, + csrValA, + CUDA_R_64F, + csrRowPtrA, + csrColIndA, + x, + CUDA_R_64F, + beta, + CUDA_R_64F, + y, + CUDA_R_64F, + CUDA_R_64F, + bufferSizeInBytes); } template -cusparseStatus_t cusparsecsrmvex( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA, - const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x, - const T* beta, T* y, T* buffer, cudaStream_t stream); -template <> -inline cusparseStatus_t cusparsecsrmvex( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA, - const float* csrValA, const int* csrRowPtrA, const int* csrColIndA, - const float* x, const float* beta, float* y, float* buffer, - cudaStream_t stream) { - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F, - descrA, csrValA, CUDA_R_32F, csrRowPtrA, csrColIndA, x, - CUDA_R_32F, beta, CUDA_R_32F, y, CUDA_R_32F, - CUDA_R_32F, buffer); -} -template <> -inline cusparseStatus_t cusparsecsrmvex( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA, - const double* csrValA, const int* csrRowPtrA, const int* csrColIndA, - const double* x, const double* beta, double* y, double* buffer, - cudaStream_t stream) { - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F, - descrA, csrValA, CUDA_R_64F, csrRowPtrA, csrColIndA, x, - CUDA_R_64F, beta, CUDA_R_64F, y, CUDA_R_64F, - CUDA_R_64F, buffer); +cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const T* alpha, + const cusparseMatDescr_t descrA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const T* x, + const T* beta, + T* y, + T* buffer, + cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const float* x, + const float* beta, + float* y, + float* buffer, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx(handle, + alg, + transA, + m, + n, + nnz, + alpha, + CUDA_R_32F, + descrA, + csrValA, + CUDA_R_32F, + csrRowPtrA, + csrColIndA, + x, + CUDA_R_32F, + beta, + CUDA_R_32F, + y, + CUDA_R_32F, + CUDA_R_32F, + buffer); +} +template <> +inline cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const double* x, + const double* beta, + double* y, + double* buffer, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx(handle, + alg, + transA, + m, + n, + nnz, + alpha, + CUDA_R_64F, + descrA, + csrValA, + CUDA_R_64F, + csrRowPtrA, + csrColIndA, + x, + CUDA_R_64F, + beta, + CUDA_R_64F, + y, + CUDA_R_64F, + CUDA_R_64F, + buffer); } /** @} */ @@ -637,68 +1035,180 @@ inline cusparseStatus_t cusparsecsrmvex( */ template -cusparseStatus_t cusparsecsr2csc_bufferSize( - cusparseHandle_t handle, int m, int n, int nnz, const T* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream); +cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle, + int m, + int n, + int nnz, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + size_t* bufferSize, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsr2csc_bufferSize( - cusparseHandle_t handle, int m, int n, int nnz, const float* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle, + int m, + int n, + int nnz, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2_bufferSize( - handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, - cscRowInd, CUDA_R_32F, copyValues, idxBase, alg, bufferSize); + return cusparseCsr2cscEx2_bufferSize(handle, + m, + n, + nnz, + csrVal, + csrRowPtr, + csrColInd, + cscVal, + cscColPtr, + cscRowInd, + CUDA_R_32F, + copyValues, + idxBase, + alg, + bufferSize); } template <> -inline cusparseStatus_t cusparsecsr2csc_bufferSize( - cusparseHandle_t handle, int m, int n, int nnz, const double* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle, + int m, + int n, + int nnz, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2_bufferSize( - handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, - cscRowInd, CUDA_R_64F, copyValues, idxBase, alg, bufferSize); + return cusparseCsr2cscEx2_bufferSize(handle, + m, + n, + nnz, + csrVal, + csrRowPtr, + csrColInd, + cscVal, + cscColPtr, + cscRowInd, + CUDA_R_64F, + copyValues, + idxBase, + alg, + bufferSize); } template -cusparseStatus_t cusparsecsr2csc( - cusparseHandle_t handle, int m, int n, int nnz, const T* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream); +cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle, + int m, + int n, + int nnz, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + void* buffer, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsr2csc( - cusparseHandle_t handle, int m, int n, int nnz, const float* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle, + int m, + int n, + int nnz, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + void* buffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, - cscVal, cscColPtr, cscRowInd, CUDA_R_32F, - copyValues, idxBase, alg, buffer); + return cusparseCsr2cscEx2(handle, + m, + n, + nnz, + csrVal, + csrRowPtr, + csrColInd, + cscVal, + cscColPtr, + cscRowInd, + CUDA_R_32F, + copyValues, + idxBase, + alg, + buffer); } template <> -inline cusparseStatus_t cusparsecsr2csc( - cusparseHandle_t handle, int m, int n, int nnz, const double* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle, + int m, + int n, + int nnz, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + void* buffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, - cscVal, cscColPtr, cscRowInd, CUDA_R_64F, - copyValues, idxBase, alg, buffer); + return cusparseCsr2cscEx2(handle, + m, + n, + nnz, + csrVal, + csrRowPtr, + csrColInd, + cscVal, + cscColPtr, + cscRowInd, + CUDA_R_64F, + copyValues, + idxBase, + alg, + buffer); } /** @} */ @@ -709,120 +1219,329 @@ inline cusparseStatus_t cusparsecsr2csc( */ template -cusparseStatus_t cusparsecsrgemm2_buffersizeext( - cusparseHandle_t handle, int m, int n, int k, const T* alpha, const T* beta, - const cusparseMatDescr_t matA, int nnzA, const int* rowindA, - const int* indicesA, const cusparseMatDescr_t matB, int nnzB, - const int* rowindB, const int* indicesB, const cusparseMatDescr_t matD, - int nnzD, const int* rowindD, const int* indicesD, csrgemm2Info_t info, - size_t* pBufferSizeInBytes, cudaStream_t stream); - -template <> -inline cusparseStatus_t cusparsecsrgemm2_buffersizeext( - cusparseHandle_t handle, int m, int n, int k, const float* alpha, - const float* beta, const cusparseMatDescr_t matA, int nnzA, - const int* rowindA, const int* indicesA, const cusparseMatDescr_t matB, - int nnzB, const int* rowindB, const int* indicesB, - const cusparseMatDescr_t matD, int nnzD, const int* rowindD, - const int* indicesD, csrgemm2Info_t info, size_t* pBufferSizeInBytes, - cudaStream_t stream) { +cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle, + int m, + int n, + int k, + const T* alpha, + const T* beta, + const cusparseMatDescr_t matA, + int nnzA, + const int* rowindA, + const int* indicesA, + const cusparseMatDescr_t matB, + int nnzB, + const int* rowindB, + const int* indicesB, + const cusparseMatDescr_t matD, + int nnzD, + const int* rowindD, + const int* indicesD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes, + cudaStream_t stream); + +template <> +inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle, + int m, + int n, + int k, + const float* alpha, + const float* beta, + const cusparseMatDescr_t matA, + int nnzA, + const int* rowindA, + const int* indicesA, + const cusparseMatDescr_t matB, + int nnzB, + const int* rowindB, + const int* indicesB, + const cusparseMatDescr_t matD, + int nnzD, + const int* rowindD, + const int* indicesD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseScsrgemm2_bufferSizeExt( - handle, m, n, k, alpha, matA, nnzA, rowindA, indicesA, matB, nnzB, rowindB, - indicesB, beta, matD, nnzD, rowindD, indicesD, info, pBufferSizeInBytes); + return cusparseScsrgemm2_bufferSizeExt(handle, + m, + n, + k, + alpha, + matA, + nnzA, + rowindA, + indicesA, + matB, + nnzB, + rowindB, + indicesB, + beta, + matD, + nnzD, + rowindD, + indicesD, + info, + pBufferSizeInBytes); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsecsrgemm2_buffersizeext( - cusparseHandle_t handle, int m, int n, int k, const double* alpha, - const double* beta, const cusparseMatDescr_t matA, int nnzA, - const int* rowindA, const int* indicesA, const cusparseMatDescr_t matB, - int nnzB, const int* rowindB, const int* indicesB, - const cusparseMatDescr_t matD, int nnzD, const int* rowindD, - const int* indicesD, csrgemm2Info_t info, size_t* pBufferSizeInBytes, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle, + int m, + int n, + int k, + const double* alpha, + const double* beta, + const cusparseMatDescr_t matA, + int nnzA, + const int* rowindA, + const int* indicesA, + const cusparseMatDescr_t matB, + int nnzB, + const int* rowindB, + const int* indicesB, + const cusparseMatDescr_t matD, + int nnzD, + const int* rowindD, + const int* indicesD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDcsrgemm2_bufferSizeExt( - handle, m, n, k, alpha, matA, nnzA, rowindA, indicesA, matB, nnzB, rowindB, - indicesB, beta, matD, nnzD, rowindD, indicesD, info, pBufferSizeInBytes); + return cusparseDcsrgemm2_bufferSizeExt(handle, + m, + n, + k, + alpha, + matA, + nnzA, + rowindA, + indicesA, + matB, + nnzB, + rowindB, + indicesB, + beta, + matD, + nnzD, + rowindD, + indicesD, + info, + pBufferSizeInBytes); #pragma GCC diagnostic pop } -inline cusparseStatus_t cusparsecsrgemm2nnz( - cusparseHandle_t handle, int m, int n, int k, const cusparseMatDescr_t matA, - int nnzA, const int* rowindA, const int* indicesA, - const cusparseMatDescr_t matB, int nnzB, const int* rowindB, - const int* indicesB, const cusparseMatDescr_t matD, int nnzD, - const int* rowindD, const int* indicesD, const cusparseMatDescr_t matC, - int* rowindC, int* nnzC, const csrgemm2Info_t info, void* pBuffer, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrgemm2nnz(cusparseHandle_t handle, + int m, + int n, + int k, + const cusparseMatDescr_t matA, + int nnzA, + const int* rowindA, + const int* indicesA, + const cusparseMatDescr_t matB, + int nnzB, + const int* rowindB, + const int* indicesB, + const cusparseMatDescr_t matD, + int nnzD, + const int* rowindD, + const int* indicesD, + const cusparseMatDescr_t matC, + int* rowindC, + int* nnzC, + const csrgemm2Info_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseXcsrgemm2Nnz(handle, m, n, k, matA, nnzA, rowindA, indicesA, - matB, nnzB, rowindB, indicesB, matD, nnzD, - rowindD, indicesD, matC, rowindC, nnzC, info, + return cusparseXcsrgemm2Nnz(handle, + m, + n, + k, + matA, + nnzA, + rowindA, + indicesA, + matB, + nnzB, + rowindB, + indicesB, + matD, + nnzD, + rowindD, + indicesD, + matC, + rowindC, + nnzC, + info, pBuffer); #pragma GCC diagnostic pop } template -cusparseStatus_t cusparsecsrgemm2( - cusparseHandle_t handle, int m, int n, int k, const T* alpha, - const cusparseMatDescr_t descrA, int nnzA, const T* csrValA, - const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB, - int nnzB, const T* csrValB, const int* csrRowPtrB, const int* csrColIndB, - const T* beta, const cusparseMatDescr_t descrD, int nnzD, const T* csrValD, - const int* csrRowPtrD, const int* csrColIndD, const cusparseMatDescr_t descrC, - T* csrValC, const int* csrRowPtrC, int* csrColIndC, const csrgemm2Info_t info, - void* pBuffer, cudaStream_t stream); - -template <> -inline cusparseStatus_t cusparsecsrgemm2( - cusparseHandle_t handle, int m, int n, int k, const float* alpha, - const cusparseMatDescr_t descrA, int nnzA, const float* csrValA, - const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB, - int nnzB, const float* csrValB, const int* csrRowPtrB, const int* csrColIndB, - const float* beta, const cusparseMatDescr_t descrD, int nnzD, - const float* csrValD, const int* csrRowPtrD, const int* csrColIndD, - const cusparseMatDescr_t descrC, float* csrValC, const int* csrRowPtrC, - int* csrColIndC, const csrgemm2Info_t info, void* pBuffer, - cudaStream_t stream) { +cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const T* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const T* csrValB, + const int* csrRowPtrB, + const int* csrColIndB, + const T* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const T* csrValD, + const int* csrRowPtrD, + const int* csrColIndD, + const cusparseMatDescr_t descrC, + T* csrValC, + const int* csrRowPtrC, + int* csrColIndC, + const csrgemm2Info_t info, + void* pBuffer, + cudaStream_t stream); + +template <> +inline cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const float* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const float* csrValB, + const int* csrRowPtrB, + const int* csrColIndB, + const float* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const float* csrValD, + const int* csrRowPtrD, + const int* csrColIndD, + const cusparseMatDescr_t descrC, + float* csrValC, + const int* csrRowPtrC, + int* csrColIndC, + const csrgemm2Info_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseScsrgemm2(handle, m, n, k, alpha, descrA, nnzA, csrValA, - csrRowPtrA, csrColIndA, descrB, nnzB, csrValB, - csrRowPtrB, csrColIndB, beta, descrD, nnzD, csrValD, - csrRowPtrD, csrColIndD, descrC, csrValC, csrRowPtrC, - csrColIndC, info, pBuffer); + return cusparseScsrgemm2(handle, + m, + n, + k, + alpha, + descrA, + nnzA, + csrValA, + csrRowPtrA, + csrColIndA, + descrB, + nnzB, + csrValB, + csrRowPtrB, + csrColIndB, + beta, + descrD, + nnzD, + csrValD, + csrRowPtrD, + csrColIndD, + descrC, + csrValC, + csrRowPtrC, + csrColIndC, + info, + pBuffer); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsecsrgemm2( - cusparseHandle_t handle, int m, int n, int k, const double* alpha, - const cusparseMatDescr_t descrA, int nnzA, const double* csrValA, - const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB, - int nnzB, const double* csrValB, const int* csrRowPtrB, const int* csrColIndB, - const double* beta, const cusparseMatDescr_t descrD, int nnzD, - const double* csrValD, const int* csrRowPtrD, const int* csrColIndD, - const cusparseMatDescr_t descrC, double* csrValC, const int* csrRowPtrC, - int* csrColIndC, const csrgemm2Info_t info, void* pBuffer, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const double* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const double* csrValB, + const int* csrRowPtrB, + const int* csrColIndB, + const double* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const double* csrValD, + const int* csrRowPtrD, + const int* csrColIndD, + const cusparseMatDescr_t descrC, + double* csrValC, + const int* csrRowPtrC, + int* csrColIndC, + const csrgemm2Info_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDcsrgemm2(handle, m, n, k, alpha, descrA, nnzA, csrValA, - csrRowPtrA, csrColIndA, descrB, nnzB, csrValB, - csrRowPtrB, csrColIndB, beta, descrD, nnzD, csrValD, - csrRowPtrD, csrColIndD, descrC, csrValC, csrRowPtrC, - csrColIndC, info, pBuffer); + return cusparseDcsrgemm2(handle, + m, + n, + k, + alpha, + descrA, + nnzA, + csrValA, + csrRowPtrA, + csrColIndA, + descrB, + nnzB, + csrValB, + csrRowPtrB, + csrColIndB, + beta, + descrD, + nnzD, + csrValD, + csrRowPtrD, + csrColIndD, + descrC, + csrValC, + csrRowPtrC, + csrColIndC, + info, + pBuffer); #pragma GCC diagnostic pop } @@ -834,33 +1553,46 @@ inline cusparseStatus_t cusparsecsrgemm2( */ template -cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n, +cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, + int m, + int n, const cusparseMatDescr_t descrA, - const T* csrValA, const int* csrRowPtrA, - const int* csrColIndA, T* A, int lda, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + T* A, + int lda, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n, +inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, + int m, + int n, const cusparseMatDescr_t descrA, const float* csrValA, const int* csrRowPtrA, - const int* csrColIndA, float* A, - int lda, cudaStream_t stream) { + const int* csrColIndA, + float* A, + int lda, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, - csrColIndA, A, lda); + return cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda); } template <> -inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n, +inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, + int m, + int n, const cusparseMatDescr_t descrA, const double* csrValA, const int* csrRowPtrA, - const int* csrColIndA, double* A, - int lda, cudaStream_t stream) { + const int* csrColIndA, + double* A, + int lda, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, - csrColIndA, A, lda); + return cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda); } /** @} */ diff --git a/cpp/include/raft/sparse/distance/bin_distance.cuh b/cpp/include/raft/sparse/distance/bin_distance.cuh index f3109556b7..aef19122da 100644 --- a/cpp/include/raft/sparse/distance/bin_distance.cuh +++ b/cpp/include/raft/sparse/distance/bin_distance.cuh @@ -37,9 +37,11 @@ namespace distance { // @TODO: Move this into sparse prims (coo_norm) template -__global__ void compute_binary_row_norm_kernel( - value_t *out, const value_idx *__restrict__ coo_rows, - const value_t *__restrict__ data, value_idx nnz) { +__global__ void compute_binary_row_norm_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ value_idx i = blockDim.x * blockIdx.x + threadIdx.x; if (i < nnz) { // We do conditional here only because it's @@ -51,55 +53,64 @@ __global__ void compute_binary_row_norm_kernel( } template -__global__ void compute_binary_warp_kernel(value_t *__restrict__ C, - const value_t *__restrict__ Q_norms, - const value_t *__restrict__ R_norms, - value_idx n_rows, value_idx n_cols, - expansion_f expansion_func) { +__global__ void compute_binary_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_norms, + const value_t* __restrict__ R_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func) +{ value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; - value_idx i = tid / n_cols; - value_idx j = tid % n_cols; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; if (i >= n_rows || j >= n_cols) return; - value_t q_norm = Q_norms[i]; - value_t r_norm = R_norms[j]; - value_t dot = C[(size_t)i * n_cols + j]; + value_t q_norm = Q_norms[i]; + value_t r_norm = R_norms[j]; + value_t dot = C[(size_t)i * n_cols + j]; C[(size_t)i * n_cols + j] = expansion_func(dot, q_norm, r_norm); } -template -void compute_binary(value_t *C, const value_t *Q_norms, const value_t *R_norms, - value_idx n_rows, value_idx n_cols, - expansion_f expansion_func, cudaStream_t stream) { +template +void compute_binary(value_t* C, + const value_t* Q_norms, + const value_t* R_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func, + cudaStream_t stream) +{ int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); compute_binary_warp_kernel<<>>( C, Q_norms, R_norms, n_rows, n_cols, expansion_func); } -template -void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows, - const value_t *Q_data, value_idx Q_nnz, - const value_idx *R_coo_rows, const value_t *R_data, - value_idx R_nnz, value_idx m, value_idx n, +template +void compute_bin_distance(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, std::shared_ptr alloc, - cudaStream_t stream, expansion_f expansion_func) { + cudaStream_t stream, + expansion_f expansion_func) +{ rmm::device_uvector Q_norms(m, stream); rmm::device_uvector R_norms(n, stream); - CUDA_CHECK( - cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); compute_binary_row_norm_kernel<<>>( Q_norms.data(), Q_coo_rows, Q_data, Q_nnz); compute_binary_row_norm_kernel<<>>( R_norms.data(), R_coo_rows, R_data, R_nnz); - compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, - stream); + compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, stream); } /** @@ -109,44 +120,52 @@ void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows, template class jaccard_expanded_distances_t : public distances_t { public: - explicit jaccard_expanded_distances_t( - const distances_config_t &config) - : config_(&config), - workspace(0, config.handle.get_stream()), - ip_dists(config) {} + explicit jaccard_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_bin_distance( - out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, - b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - value_t q_r_union = q_norm + r_norm; - value_t denom = q_r_union - dot; - - value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom); - - // flip the similarity when both rows are 0 - bool both_empty = q_r_union == 0; - return 1 - ((!both_empty * jacc) + both_empty); - }); + compute_bin_distance(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->handle.get_device_allocator(), + config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + value_t denom = q_r_union - dot; + + value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom); + + // flip the similarity when both rows are 0 + bool both_empty = q_r_union == 0; + return 1 - ((!both_empty * jacc) + both_empty); + }); } ~jaccard_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; @@ -158,40 +177,48 @@ class jaccard_expanded_distances_t : public distances_t { template class dice_expanded_distances_t : public distances_t { public: - explicit dice_expanded_distances_t( - const distances_config_t &config) - : config_(&config), - workspace(0, config.handle.get_stream()), - ip_dists(config) {} + explicit dice_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_bin_distance( - out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, - b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - value_t q_r_union = q_norm + r_norm; - value_t dice = (2 * dot) / q_r_union; - bool both_empty = q_r_union == 0; - return 1 - ((!both_empty * dice) + both_empty); - }); + compute_bin_distance(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->handle.get_device_allocator(), + config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + value_t dice = (2 * dot) / q_r_union; + bool both_empty = q_r_union == 0; + return 1 - ((!both_empty * dice) + both_empty); + }); } ~dice_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; diff --git a/cpp/include/raft/sparse/distance/common.h b/cpp/include/raft/sparse/distance/common.h index 1c55412eec..29c823bcdb 100644 --- a/cpp/include/raft/sparse/distance/common.h +++ b/cpp/include/raft/sparse/distance/common.h @@ -24,31 +24,31 @@ namespace distance { template struct distances_config_t { - distances_config_t(const raft::handle_t &handle_) : handle(handle_) {} + distances_config_t(const raft::handle_t& handle_) : handle(handle_) {} // left side value_idx a_nrows; value_idx a_ncols; value_idx a_nnz; - value_idx *a_indptr; - value_idx *a_indices; - value_t *a_data; + value_idx* a_indptr; + value_idx* a_indices; + value_t* a_data; // right side value_idx b_nrows; value_idx b_ncols; value_idx b_nnz; - value_idx *b_indptr; - value_idx *b_indices; - value_t *b_data; + value_idx* b_indptr; + value_idx* b_indices; + value_t* b_data; - const raft::handle_t &handle; + const raft::handle_t& handle; }; template class distances_t { public: - virtual void compute(value_t *out) {} + virtual void compute(value_t* out) {} virtual ~distances_t() = default; }; diff --git a/cpp/include/raft/sparse/distance/coo_spmv.cuh b/cpp/include/raft/sparse/distance/coo_spmv.cuh index 3a78f9ada0..cdf1be0c68 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv.cuh @@ -41,19 +41,29 @@ namespace raft { namespace sparse { namespace distance { -template inline void balanced_coo_pairwise_generalized_spmv( - value_t *out_dists, const distances_config_t &config_, - value_idx *coo_rows_b, product_f product_func, accum_f accum_func, - write_f write_func, strategy_t strategy, int chunk_size = 500000) { - CUDA_CHECK(cudaMemsetAsync( - out_dists, 0, sizeof(value_t) * config_.a_nrows * config_.b_nrows, - config_.handle.get_stream())); - - strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, - chunk_size); + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + strategy_t strategy, + int chunk_size = 500000) +{ + CUDA_CHECK(cudaMemsetAsync(out_dists, + 0, + sizeof(value_t) * config_.a_nrows * config_.b_nrows, + config_.handle.get_stream())); + + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); }; /** @@ -89,39 +99,55 @@ inline void balanced_coo_pairwise_generalized_spmv( * this value was found through profiling and represents a reasonable * setting for both large and small densities */ -template +template inline void balanced_coo_pairwise_generalized_spmv( - value_t *out_dists, const distances_config_t &config_, - value_idx *coo_rows_b, product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size = 500000) { - CUDA_CHECK(cudaMemsetAsync( - out_dists, 0, sizeof(value_t) * config_.a_nrows * config_.b_nrows, - config_.handle.get_stream())); + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size = 500000) +{ + CUDA_CHECK(cudaMemsetAsync(out_dists, + 0, + sizeof(value_t) * config_.a_nrows * config_.b_nrows, + config_.handle.get_stream())); int max_cols = max_cols_per_block(); if (max_cols > config_.a_ncols) { - dense_smem_strategy strategy( - config_); - strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, - write_func, chunk_size); + dense_smem_strategy strategy(config_); + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); } else { hash_strategy strategy(config_); - strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, - write_func, chunk_size); + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); } }; -template inline void balanced_coo_pairwise_generalized_spmv_rev( - value_t *out_dists, const distances_config_t &config_, - value_idx *coo_rows_a, product_f product_func, accum_f accum_func, - write_f write_func, strategy_t strategy, int chunk_size = 500000) { - strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, - write_func, chunk_size); + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + strategy_t strategy, + int chunk_size = 500000) +{ + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); }; /** @@ -160,24 +186,30 @@ inline void balanced_coo_pairwise_generalized_spmv_rev( * this value was found through profiling and represents a reasonable * setting for both large and small densities */ -template +template inline void balanced_coo_pairwise_generalized_spmv_rev( - value_t *out_dists, const distances_config_t &config_, - value_idx *coo_rows_a, product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size = 500000) { + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size = 500000) +{ // try dense first int max_cols = max_cols_per_block(); if (max_cols > config_.b_ncols) { - dense_smem_strategy strategy( - config_); - strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, - write_func, chunk_size); + dense_smem_strategy strategy(config_); + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); } else { hash_strategy strategy(config_); - strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, - write_func, chunk_size); + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); } }; diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh index 5ace978a23..7a83e73183 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh @@ -32,58 +32,114 @@ namespace distance { template class coo_spmv_strategy { public: - coo_spmv_strategy(const distances_config_t &config_) - : config(config_) { + coo_spmv_strategy(const distances_config_t& config_) : config(config_) + { smem = raft::getSharedMemPerBlock(); } - template - void _dispatch_base(strategy_t &strategy, int smem_dim, indptr_it &a_indptr, - value_t *out_dists, value_idx *coo_rows_b, - product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size, int n_blocks, - int n_blocks_per_row) { - CUDA_CHECK(cudaFuncSetCacheConfig( - balanced_coo_generalized_spmv_kernel, - cudaFuncCachePreferShared)); + template + void _dispatch_base(strategy_t& strategy, + int smem_dim, + indptr_it& a_indptr, + value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size, + int n_blocks, + int n_blocks_per_row) + { + CUDA_CHECK(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); - balanced_coo_generalized_spmv_kernel - <<>>( - strategy, a_indptr, config.a_indices, config.a_data, config.a_nnz, - coo_rows_b, config.b_indices, config.b_data, config.a_nrows, - config.b_nrows, smem_dim, config.b_nnz, out_dists, n_blocks_per_row, - chunk_size, config.b_ncols, product_func, accum_func, write_func); + balanced_coo_generalized_spmv_kernel + <<>>(strategy, + a_indptr, + config.a_indices, + config.a_data, + config.a_nnz, + coo_rows_b, + config.b_indices, + config.b_data, + config.a_nrows, + config.b_nrows, + smem_dim, + config.b_nnz, + out_dists, + n_blocks_per_row, + chunk_size, + config.b_ncols, + product_func, + accum_func, + write_func); } - template - void _dispatch_base_rev(strategy_t &strategy, int smem_dim, - indptr_it &b_indptr, value_t *out_dists, - value_idx *coo_rows_a, product_f product_func, - accum_f accum_func, write_f write_func, - int chunk_size, int n_blocks, int n_blocks_per_row) { - CUDA_CHECK(cudaFuncSetCacheConfig( - balanced_coo_generalized_spmv_kernel, - cudaFuncCachePreferShared)); + template + void _dispatch_base_rev(strategy_t& strategy, + int smem_dim, + indptr_it& b_indptr, + value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size, + int n_blocks, + int n_blocks_per_row) + { + CUDA_CHECK(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); - balanced_coo_generalized_spmv_kernel - <<>>( - strategy, b_indptr, config.b_indices, config.b_data, config.b_nnz, - coo_rows_a, config.a_indices, config.a_data, config.b_nrows, - config.a_nrows, smem_dim, config.a_nnz, out_dists, n_blocks_per_row, - chunk_size, config.a_ncols, product_func, accum_func, write_func); + balanced_coo_generalized_spmv_kernel + <<>>(strategy, + b_indptr, + config.b_indices, + config.b_data, + config.b_nnz, + coo_rows_a, + config.a_indices, + config.a_data, + config.b_nrows, + config.a_nrows, + smem_dim, + config.a_nnz, + out_dists, + n_blocks_per_row, + chunk_size, + config.a_ncols, + product_func, + accum_func, + write_func); } protected: int smem; - const distances_config_t &config; + const distances_config_t& config; }; } // namespace distance diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh index 44c3833f96..6586067b56 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh @@ -29,11 +29,15 @@ namespace distance { template class mask_row_it { public: - mask_row_it(const value_idx *full_indptr_, const value_idx &n_rows_, - value_idx *mask_row_idx_ = NULL) - : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_) {} + mask_row_it(const value_idx* full_indptr_, + const value_idx& n_rows_, + value_idx* mask_row_idx_ = NULL) + : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_) + { + } - __device__ inline value_idx get_row_idx(const int &n_blocks_nnz_b) { + __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b) + { if (mask_row_idx != NULL) { return mask_row_idx[blockIdx.x / n_blocks_nnz_b]; } else { @@ -41,37 +45,49 @@ class mask_row_it { } } - __device__ inline void get_row_offsets( - const value_idx &row_idx, value_idx &start_offset, value_idx &stop_offset, - const value_idx &n_blocks_nnz_b, bool &first_a_chunk, bool &last_a_chunk) { + __device__ inline void get_row_offsets(const value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + const value_idx& n_blocks_nnz_b, + bool& first_a_chunk, + bool& last_a_chunk) + { start_offset = full_indptr[row_idx]; - stop_offset = full_indptr[row_idx + 1] - 1; + stop_offset = full_indptr[row_idx + 1] - 1; } - __device__ constexpr inline void get_indices_boundary( - const value_idx *indices, value_idx &indices_len, value_idx &start_offset, - value_idx &stop_offset, value_idx &start_index, value_idx &stop_index, - bool &first_a_chunk, bool &last_a_chunk) { + __device__ constexpr inline void get_indices_boundary(const value_idx* indices, + value_idx& indices_len, + value_idx& start_offset, + value_idx& stop_offset, + value_idx& start_index, + value_idx& stop_index, + bool& first_a_chunk, + bool& last_a_chunk) + { // do nothing; } - __device__ constexpr inline bool check_indices_bounds( - value_idx &start_index_a, value_idx &stop_index_a, value_idx &index_b) { + __device__ constexpr inline bool check_indices_bounds(value_idx& start_index_a, + value_idx& stop_index_a, + value_idx& index_b) + { return true; } const value_idx *full_indptr, &n_rows; - value_idx *mask_row_idx; + value_idx* mask_row_idx; }; template -__global__ void fill_chunk_indices_kernel(value_idx *n_chunks_per_row, - value_idx *chunk_indices, - value_idx n_rows) { +__global__ void fill_chunk_indices_kernel(value_idx* n_chunks_per_row, + value_idx* chunk_indices, + value_idx n_rows) +{ auto tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < n_rows) { auto start = n_chunks_per_row[tid]; - auto end = n_chunks_per_row[tid + 1]; + auto end = n_chunks_per_row[tid + 1]; #pragma unroll for (int i = start; i < end; i++) { @@ -83,73 +99,89 @@ __global__ void fill_chunk_indices_kernel(value_idx *n_chunks_per_row, template class chunked_mask_row_it : public mask_row_it { public: - chunked_mask_row_it(const value_idx *full_indptr_, const value_idx &n_rows_, - value_idx *mask_row_idx_, int row_chunk_size_, - const value_idx *n_chunks_per_row_, - const value_idx *chunk_indices_, + chunked_mask_row_it(const value_idx* full_indptr_, + const value_idx& n_rows_, + value_idx* mask_row_idx_, + int row_chunk_size_, + const value_idx* n_chunks_per_row_, + const value_idx* chunk_indices_, const cudaStream_t stream_) : mask_row_it(full_indptr_, n_rows_, mask_row_idx_), row_chunk_size(row_chunk_size_), n_chunks_per_row(n_chunks_per_row_), chunk_indices(chunk_indices_), - stream(stream_) {} + stream(stream_) + { + } - static void init(const value_idx *indptr, const value_idx *mask_row_idx, - const value_idx &n_rows, const int row_chunk_size, - rmm::device_uvector &n_chunks_per_row, - rmm::device_uvector &chunk_indices, - cudaStream_t stream) { + static void init(const value_idx* indptr, + const value_idx* mask_row_idx, + const value_idx& n_rows, + const int row_chunk_size, + rmm::device_uvector& n_chunks_per_row, + rmm::device_uvector& chunk_indices, + cudaStream_t stream) + { auto policy = rmm::exec_policy(stream); constexpr value_idx first_element = 0; n_chunks_per_row.set_element_async(0, first_element, stream); n_chunks_per_row_functor chunk_functor(indptr, row_chunk_size); - thrust::transform(policy, mask_row_idx, mask_row_idx + n_rows, - n_chunks_per_row.begin() + 1, chunk_functor); + thrust::transform( + policy, mask_row_idx, mask_row_idx + n_rows, n_chunks_per_row.begin() + 1, chunk_functor); - thrust::inclusive_scan(policy, n_chunks_per_row.begin() + 1, - n_chunks_per_row.end(), - n_chunks_per_row.begin() + 1); + thrust::inclusive_scan( + policy, n_chunks_per_row.begin() + 1, n_chunks_per_row.end(), n_chunks_per_row.begin() + 1); - raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1, - stream); + raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1, stream); fill_chunk_indices(n_rows, n_chunks_per_row, chunk_indices, stream); } - __device__ inline value_idx get_row_idx(const int &n_blocks_nnz_b) { + __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b) + { return this->mask_row_idx[chunk_indices[blockIdx.x / n_blocks_nnz_b]]; } - __device__ inline void get_row_offsets( - const value_idx &row_idx, value_idx &start_offset, value_idx &stop_offset, - const int &n_blocks_nnz_b, bool &first_a_chunk, bool &last_a_chunk) { - auto chunk_index = blockIdx.x / n_blocks_nnz_b; - auto chunk_val = chunk_indices[chunk_index]; - auto prev_n_chunks = n_chunks_per_row[chunk_val]; + __device__ inline void get_row_offsets(const value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + const int& n_blocks_nnz_b, + bool& first_a_chunk, + bool& last_a_chunk) + { + auto chunk_index = blockIdx.x / n_blocks_nnz_b; + auto chunk_val = chunk_indices[chunk_index]; + auto prev_n_chunks = n_chunks_per_row[chunk_val]; auto relative_chunk = chunk_index - prev_n_chunks; - first_a_chunk = relative_chunk == 0; + first_a_chunk = relative_chunk == 0; start_offset = this->full_indptr[row_idx] + relative_chunk * row_chunk_size; - stop_offset = start_offset + row_chunk_size; + stop_offset = start_offset + row_chunk_size; auto final_stop_offset = this->full_indptr[row_idx + 1]; last_a_chunk = stop_offset >= final_stop_offset; - stop_offset = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1; + stop_offset = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1; } - __device__ inline void get_indices_boundary( - const value_idx *indices, value_idx &row_idx, value_idx &start_offset, - value_idx &stop_offset, value_idx &start_index, value_idx &stop_index, - bool &first_a_chunk, bool &last_a_chunk) { + __device__ inline void get_indices_boundary(const value_idx* indices, + value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + value_idx& start_index, + value_idx& stop_index, + bool& first_a_chunk, + bool& last_a_chunk) + { start_index = first_a_chunk ? start_index : indices[start_offset - 1] + 1; - stop_index = last_a_chunk ? stop_index : indices[stop_offset]; + stop_index = last_a_chunk ? stop_index : indices[stop_offset]; } - __device__ inline bool check_indices_bounds(value_idx &start_index_a, - value_idx &stop_index_a, - value_idx &index_b) { + __device__ inline bool check_indices_bounds(value_idx& start_index_a, + value_idx& stop_index_a, + value_idx& index_b) + { return (index_b >= start_index_a && index_b <= stop_index_a); } @@ -160,30 +192,34 @@ class chunked_mask_row_it : public mask_row_it { struct n_chunks_per_row_functor { public: - n_chunks_per_row_functor(const value_idx *indptr_, - value_idx row_chunk_size_) - : indptr(indptr_), row_chunk_size(row_chunk_size_) {} + n_chunks_per_row_functor(const value_idx* indptr_, value_idx row_chunk_size_) + : indptr(indptr_), row_chunk_size(row_chunk_size_) + { + } - __host__ __device__ value_idx operator()(const value_idx &i) { + __host__ __device__ value_idx operator()(const value_idx& i) + { auto degree = indptr[i + 1] - indptr[i]; return raft::ceildiv(degree, (value_idx)row_chunk_size); } - const value_idx *indptr; + const value_idx* indptr; value_idx row_chunk_size; }; private: - static void fill_chunk_indices( - const value_idx &n_rows, rmm::device_uvector &n_chunks_per_row, - rmm::device_uvector &chunk_indices, cudaStream_t stream) { + static void fill_chunk_indices(const value_idx& n_rows, + rmm::device_uvector& n_chunks_per_row, + rmm::device_uvector& chunk_indices, + cudaStream_t stream) + { auto n_threads = std::min(n_rows, 256); - auto n_blocks = raft::ceildiv(n_rows, (value_idx)n_threads); + auto n_blocks = raft::ceildiv(n_rows, (value_idx)n_threads); chunk_indices.resize(total_row_blocks, stream); - fill_chunk_indices_kernel<<>>( - n_chunks_per_row.data(), chunk_indices.data(), n_rows); + fill_chunk_indices_kernel + <<>>(n_chunks_per_row.data(), chunk_indices.data(), n_rows); } }; diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh index c463654a3b..aac98d6b02 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh @@ -25,71 +25,91 @@ namespace distance { template class dense_smem_strategy : public coo_spmv_strategy { public: - using smem_type = value_t *; + using smem_type = value_t*; using insert_type = smem_type; - using find_type = smem_type; + using find_type = smem_type; - dense_smem_strategy(const distances_config_t &config_) - : coo_spmv_strategy(config_) {} + dense_smem_strategy(const distances_config_t& config_) + : coo_spmv_strategy(config_) + { + } - inline static int smem_per_block(int n_cols) { - return (n_cols * sizeof(value_t)) + - ((1024 / raft::warp_size()) * sizeof(value_t)); + inline static int smem_per_block(int n_cols) + { + return (n_cols * sizeof(value_t)) + ((1024 / raft::warp_size()) * sizeof(value_t)); } template - void dispatch(value_t *out_dists, value_idx *coo_rows_b, - product_f product_func, accum_f accum_func, write_f write_func, - int chunk_size) { - auto n_blocks_per_row = - raft::ceildiv(this->config.b_nnz, chunk_size * 1024); - auto n_blocks = this->config.a_nrows * n_blocks_per_row; - - mask_row_it a_indptr(this->config.a_indptr, - this->config.a_nrows); - - this->_dispatch_base(*this, this->config.b_ncols, a_indptr, out_dists, - coo_rows_b, product_func, accum_func, write_func, - chunk_size, n_blocks, n_blocks_per_row); + void dispatch(value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * 1024); + auto n_blocks = this->config.a_nrows * n_blocks_per_row; + + mask_row_it a_indptr(this->config.a_indptr, this->config.a_nrows); + + this->_dispatch_base(*this, + this->config.b_ncols, + a_indptr, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_blocks, + n_blocks_per_row); } template - void dispatch_rev(value_t *out_dists, value_idx *coo_rows_a, - product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size) { - auto n_blocks_per_row = - raft::ceildiv(this->config.a_nnz, chunk_size * 1024); - auto n_blocks = this->config.b_nrows * n_blocks_per_row; - - mask_row_it b_indptr(this->config.b_indptr, - this->config.b_nrows); - - this->_dispatch_base_rev(*this, this->config.a_ncols, b_indptr, out_dists, - coo_rows_a, product_func, accum_func, write_func, - chunk_size, n_blocks, n_blocks_per_row); + void dispatch_rev(value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * 1024); + auto n_blocks = this->config.b_nrows * n_blocks_per_row; + + mask_row_it b_indptr(this->config.b_indptr, this->config.b_nrows); + + this->_dispatch_base_rev(*this, + this->config.a_ncols, + b_indptr, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_blocks, + n_blocks_per_row); } - __device__ inline insert_type init_insert(smem_type cache, - const value_idx &cache_size) { + __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size) + { for (int k = threadIdx.x; k < cache_size; k += blockDim.x) { cache[k] = 0.0; } return cache; } - __device__ inline void insert(insert_type cache, const value_idx &key, - const value_t &value) { + __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value) + { cache[key] = value; } - __device__ inline find_type init_find(smem_type cache, - const value_idx &cache_size) { + __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size) + { return cache; } - __device__ inline value_t find(find_type cache, const value_idx &key) { - return cache[key]; - } + __device__ inline value_t find(find_type cache, const value_idx& key) { return cache[key]; } }; } // namespace distance diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh index 1295d24103..3f8f4b21ad 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh @@ -1,18 +1,18 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once @@ -38,177 +38,238 @@ template class hash_strategy : public coo_spmv_strategy { public: using insert_type = - typename cuco::static_map::device_mutable_view; - using smem_type = typename insert_type::slot_type *; + typename cuco::static_map::device_mutable_view; + using smem_type = typename insert_type::slot_type*; using find_type = - typename cuco::static_map::device_view; + typename cuco::static_map::device_view; - hash_strategy(const distances_config_t &config_, - float capacity_threshold_ = 0.5, int map_size_ = get_map_size()) + hash_strategy(const distances_config_t& config_, + float capacity_threshold_ = 0.5, + int map_size_ = get_map_size()) : coo_spmv_strategy(config_), capacity_threshold(capacity_threshold_), - map_size(map_size_) {} + map_size(map_size_) + { + } - void chunking_needed(const value_idx *indptr, const value_idx n_rows, - rmm::device_uvector &mask_indptr, - std::tuple &n_rows_divided, - cudaStream_t stream) { + void chunking_needed(const value_idx* indptr, + const value_idx n_rows, + rmm::device_uvector& mask_indptr, + std::tuple& n_rows_divided, + cudaStream_t stream) + { auto policy = rmm::exec_policy(stream); - auto less = thrust::copy_if( - policy, thrust::make_counting_iterator(value_idx(0)), - thrust::make_counting_iterator(n_rows), mask_indptr.data(), - fits_in_hash_table(indptr, 0, capacity_threshold * map_size)); + auto less = thrust::copy_if(policy, + thrust::make_counting_iterator(value_idx(0)), + thrust::make_counting_iterator(n_rows), + mask_indptr.data(), + fits_in_hash_table(indptr, 0, capacity_threshold * map_size)); std::get<0>(n_rows_divided) = less - mask_indptr.data(); auto more = thrust::copy_if( - policy, thrust::make_counting_iterator(value_idx(0)), - thrust::make_counting_iterator(n_rows), less, - fits_in_hash_table(indptr, capacity_threshold * map_size, - std::numeric_limits::max())); + policy, + thrust::make_counting_iterator(value_idx(0)), + thrust::make_counting_iterator(n_rows), + less, + fits_in_hash_table( + indptr, capacity_threshold * map_size, std::numeric_limits::max())); std::get<1>(n_rows_divided) = more - less; } template - void dispatch(value_t *out_dists, value_idx *coo_rows_b, - product_f product_func, accum_f accum_func, write_f write_func, - int chunk_size) { + void dispatch(value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * tpb); - rmm::device_uvector mask_indptr( - this->config.a_nrows, this->config.handle.get_stream()); + rmm::device_uvector mask_indptr(this->config.a_nrows, + this->config.handle.get_stream()); std::tuple n_rows_divided; - chunking_needed(this->config.a_indptr, this->config.a_nrows, mask_indptr, - n_rows_divided, this->config.handle.get_stream()); + chunking_needed(this->config.a_indptr, + this->config.a_nrows, + mask_indptr, + n_rows_divided, + this->config.handle.get_stream()); auto less_rows = std::get<0>(n_rows_divided); if (less_rows > 0) { - mask_row_it less(this->config.a_indptr, less_rows, - mask_indptr.data()); + mask_row_it less(this->config.a_indptr, less_rows, mask_indptr.data()); auto n_less_blocks = less_rows * n_blocks_per_row; - this->_dispatch_base(*this, map_size, less, out_dists, coo_rows_b, - product_func, accum_func, write_func, chunk_size, - n_less_blocks, n_blocks_per_row); + this->_dispatch_base(*this, + map_size, + less, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_less_blocks, + n_blocks_per_row); } auto more_rows = std::get<1>(n_rows_divided); if (more_rows > 0) { - rmm::device_uvector n_chunks_per_row( - more_rows + 1, this->config.handle.get_stream()); - rmm::device_uvector chunk_indices( - 0, this->config.handle.get_stream()); - chunked_mask_row_it::init( - this->config.a_indptr, mask_indptr.data() + less_rows, more_rows, - capacity_threshold * map_size, n_chunks_per_row, chunk_indices, - this->config.handle.get_stream()); - - chunked_mask_row_it more( - this->config.a_indptr, more_rows, mask_indptr.data() + less_rows, - capacity_threshold * map_size, n_chunks_per_row.data(), - chunk_indices.data(), this->config.handle.get_stream()); + rmm::device_uvector n_chunks_per_row(more_rows + 1, + this->config.handle.get_stream()); + rmm::device_uvector chunk_indices(0, this->config.handle.get_stream()); + chunked_mask_row_it::init(this->config.a_indptr, + mask_indptr.data() + less_rows, + more_rows, + capacity_threshold * map_size, + n_chunks_per_row, + chunk_indices, + this->config.handle.get_stream()); + + chunked_mask_row_it more(this->config.a_indptr, + more_rows, + mask_indptr.data() + less_rows, + capacity_threshold * map_size, + n_chunks_per_row.data(), + chunk_indices.data(), + this->config.handle.get_stream()); auto n_more_blocks = more.total_row_blocks * n_blocks_per_row; - this->_dispatch_base(*this, map_size, more, out_dists, coo_rows_b, - product_func, accum_func, write_func, chunk_size, - n_more_blocks, n_blocks_per_row); + this->_dispatch_base(*this, + map_size, + more, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_more_blocks, + n_blocks_per_row); } } template - void dispatch_rev(value_t *out_dists, value_idx *coo_rows_a, - product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size) { + void dispatch_rev(value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * tpb); - rmm::device_uvector mask_indptr( - this->config.b_nrows, this->config.handle.get_stream()); + rmm::device_uvector mask_indptr(this->config.b_nrows, + this->config.handle.get_stream()); std::tuple n_rows_divided; - chunking_needed(this->config.b_indptr, this->config.b_nrows, mask_indptr, - n_rows_divided, this->config.handle.get_stream()); + chunking_needed(this->config.b_indptr, + this->config.b_nrows, + mask_indptr, + n_rows_divided, + this->config.handle.get_stream()); auto less_rows = std::get<0>(n_rows_divided); if (less_rows > 0) { - mask_row_it less(this->config.b_indptr, less_rows, - mask_indptr.data()); + mask_row_it less(this->config.b_indptr, less_rows, mask_indptr.data()); auto n_less_blocks = less_rows * n_blocks_per_row; - this->_dispatch_base_rev(*this, map_size, less, out_dists, coo_rows_a, - product_func, accum_func, write_func, chunk_size, - n_less_blocks, n_blocks_per_row); + this->_dispatch_base_rev(*this, + map_size, + less, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_less_blocks, + n_blocks_per_row); } auto more_rows = std::get<1>(n_rows_divided); if (more_rows > 0) { - rmm::device_uvector n_chunks_per_row( - more_rows + 1, this->config.handle.get_stream()); - rmm::device_uvector chunk_indices( - 0, this->config.handle.get_stream()); - chunked_mask_row_it::init( - this->config.b_indptr, mask_indptr.data() + less_rows, more_rows, - capacity_threshold * map_size, n_chunks_per_row, chunk_indices, - this->config.handle.get_stream()); - - chunked_mask_row_it more( - this->config.b_indptr, more_rows, mask_indptr.data() + less_rows, - capacity_threshold * map_size, n_chunks_per_row.data(), - chunk_indices.data(), this->config.handle.get_stream()); + rmm::device_uvector n_chunks_per_row(more_rows + 1, + this->config.handle.get_stream()); + rmm::device_uvector chunk_indices(0, this->config.handle.get_stream()); + chunked_mask_row_it::init(this->config.b_indptr, + mask_indptr.data() + less_rows, + more_rows, + capacity_threshold * map_size, + n_chunks_per_row, + chunk_indices, + this->config.handle.get_stream()); + + chunked_mask_row_it more(this->config.b_indptr, + more_rows, + mask_indptr.data() + less_rows, + capacity_threshold * map_size, + n_chunks_per_row.data(), + chunk_indices.data(), + this->config.handle.get_stream()); auto n_more_blocks = more.total_row_blocks * n_blocks_per_row; - this->_dispatch_base_rev(*this, map_size, more, out_dists, coo_rows_a, - product_func, accum_func, write_func, chunk_size, - n_more_blocks, n_blocks_per_row); + this->_dispatch_base_rev(*this, + map_size, + more, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_more_blocks, + n_blocks_per_row); } } - __device__ inline insert_type init_insert(smem_type cache, - const value_idx &cache_size) { + __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size) + { return insert_type::make_from_uninitialized_slots( cooperative_groups::this_thread_block(), cache, cache_size, -1, 0); } - __device__ inline void insert(insert_type cache, const value_idx &key, - const value_t &value) { + __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value) + { auto success = cache.insert(cuco::pair(key, value)); } - __device__ inline find_type init_find(smem_type cache, - const value_idx &cache_size) { + __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size) + { return find_type(cache, cache_size, -1, 0); } - __device__ inline value_t find(find_type cache, const value_idx &key) { + __device__ inline value_t find(find_type cache, const value_idx& key) + { auto a_pair = cache.find(key); value_t a_col = 0.0; - if (a_pair != cache.end()) { - a_col = a_pair->second; - } + if (a_pair != cache.end()) { a_col = a_pair->second; } return a_col; } struct fits_in_hash_table { public: - fits_in_hash_table(const value_idx *indptr_, value_idx degree_l_, - value_idx degree_r_) - : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_) {} + fits_in_hash_table(const value_idx* indptr_, value_idx degree_l_, value_idx degree_r_) + : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_) + { + } - __host__ __device__ bool operator()(const value_idx &i) { + __host__ __device__ bool operator()(const value_idx& i) + { auto degree = indptr[i + 1] - indptr[i]; return degree >= degree_l && degree < degree_r; } private: - const value_idx *indptr; + const value_idx* indptr; const value_idx degree_l, degree_r; }; - inline static int get_map_size() { - return (raft::getSharedMemPerBlock() - - ((tpb / raft::warp_size()) * sizeof(value_t))) / + inline static int get_map_size() + { + return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) / sizeof(typename insert_type::slot_type); } diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh index 51f9a05394..b12252ab25 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh @@ -27,68 +27,88 @@ namespace sparse { namespace distance { /** - * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with - * sparse-matrix-sparse-vector multiplication layout (SPMV). - * This is intended to be scheduled n_chunks_b times for each row of a. - * The steps are as follows: - * - * 1. Load row from A into dense vector in shared memory. - * This can be further chunked in the future if necessary to support larger - * column sizes. - * 2. Threads of block all step through chunks of B in parallel. - * When a new row is encountered in row_indices_b, a segmented - * reduction is performed across the warps and then across the - * block and the final value written out to host memory. - * - * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf - * - * @tparam value_idx index type - * @tparam value_t value type - * @tparam tpb threads per block configured on launch - * @tparam rev if this is true, the reduce/accumulate functions are only - * executed when A[col] == 0.0. when executed before/after !rev - * and A & B are reversed, this allows the full symmetric difference - * and intersection to be computed. - * @tparam kv_t data type stored in shared mem cache - * @tparam product_f reduce function type (semiring product() function). - * accepts two arguments of value_t and returns a value_t - * @tparam accum_f accumulation function type (semiring sum() function). - * accepts two arguments of value_t and returns a value_t - * @tparam write_f function to write value out. this should be mathematically - * equivalent to the accumulate function but implemented as - * an atomic operation on global memory. Accepts two arguments - * of value_t* and value_t and updates the value given by the - * pointer. - * @param[in] indptrA column pointer array for A - * @param[in] indicesA column indices array for A - * @param[in] dataA data array for A - * @param[in] rowsB coo row array for B - * @param[in] indicesB column indices array for B - * @param[in] dataB data array for B - * @param[in] m number of rows in A - * @param[in] n number of rows in B - * @param[in] dim number of features - * @param[in] nnz_b number of nonzeros in B - * @param[out] out array of size m*n - * @param[in] n_blocks_per_row number of blocks of B per row of A - * @param[in] chunk_size number of nnz for B to use for each row of A - * @param[in] buffer_size amount of smem to use for each row of A - * @param[in] product_func semiring product() function - * @param[in] accum_func semiring sum() function - * @param[in] write_func atomic semiring sum() function - */ -template -__global__ void balanced_coo_generalized_spmv_kernel( - strategy_t strategy, indptr_it indptrA, value_idx *indicesA, value_t *dataA, - value_idx nnz_a, value_idx *rowsB, value_idx *indicesB, value_t *dataB, - value_idx m, value_idx n, int dim, value_idx nnz_b, value_t *out, - int n_blocks_per_row, int chunk_size, value_idx b_ncols, - product_f product_func, accum_f accum_func, write_f write_func) { + * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with + * sparse-matrix-sparse-vector multiplication layout (SPMV). + * This is intended to be scheduled n_chunks_b times for each row of a. + * The steps are as follows: + * + * 1. Load row from A into dense vector in shared memory. + * This can be further chunked in the future if necessary to support larger + * column sizes. + * 2. Threads of block all step through chunks of B in parallel. + * When a new row is encountered in row_indices_b, a segmented + * reduction is performed across the warps and then across the + * block and the final value written out to host memory. + * + * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf + * + * @tparam value_idx index type + * @tparam value_t value type + * @tparam tpb threads per block configured on launch + * @tparam rev if this is true, the reduce/accumulate functions are only + * executed when A[col] == 0.0. when executed before/after !rev + * and A & B are reversed, this allows the full symmetric difference + * and intersection to be computed. + * @tparam kv_t data type stored in shared mem cache + * @tparam product_f reduce function type (semiring product() function). + * accepts two arguments of value_t and returns a value_t + * @tparam accum_f accumulation function type (semiring sum() function). + * accepts two arguments of value_t and returns a value_t + * @tparam write_f function to write value out. this should be mathematically + * equivalent to the accumulate function but implemented as + * an atomic operation on global memory. Accepts two arguments + * of value_t* and value_t and updates the value given by the + * pointer. + * @param[in] indptrA column pointer array for A + * @param[in] indicesA column indices array for A + * @param[in] dataA data array for A + * @param[in] rowsB coo row array for B + * @param[in] indicesB column indices array for B + * @param[in] dataB data array for B + * @param[in] m number of rows in A + * @param[in] n number of rows in B + * @param[in] dim number of features + * @param[in] nnz_b number of nonzeros in B + * @param[out] out array of size m*n + * @param[in] n_blocks_per_row number of blocks of B per row of A + * @param[in] chunk_size number of nnz for B to use for each row of A + * @param[in] buffer_size amount of smem to use for each row of A + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + */ +template +__global__ void balanced_coo_generalized_spmv_kernel(strategy_t strategy, + indptr_it indptrA, + value_idx* indicesA, + value_t* dataA, + value_idx nnz_a, + value_idx* rowsB, + value_idx* indicesB, + value_t* dataB, + value_idx m, + value_idx n, + int dim, + value_idx nnz_b, + value_t* out, + int n_blocks_per_row, + int chunk_size, + value_idx b_ncols, + product_f product_func, + accum_f accum_func, + write_f write_func) +{ typedef cub::WarpReduce warp_reduce; - value_idx cur_row_a = indptrA.get_row_idx(n_blocks_per_row); + value_idx cur_row_a = indptrA.get_row_idx(n_blocks_per_row); value_idx cur_chunk_offset = blockIdx.x % n_blocks_per_row; // chunk starting offset @@ -96,18 +116,17 @@ __global__ void balanced_coo_generalized_spmv_kernel( // how many total cols will be processed by this block (should be <= chunk_size * n_threads) value_idx active_chunk_size = min(chunk_size * tpb, nnz_b - ind_offset); - int tid = threadIdx.x; + int tid = threadIdx.x; int warp_id = tid / raft::warp_size(); // compute id relative to current warp unsigned int lane_id = tid & (raft::warp_size() - 1); - value_idx ind = ind_offset + threadIdx.x; + value_idx ind = ind_offset + threadIdx.x; extern __shared__ char smem[]; - typename strategy_t::smem_type A = (typename strategy_t::smem_type)(smem); - typename warp_reduce::TempStorage *temp_storage = - (typename warp_reduce::TempStorage *)(A + dim); + typename strategy_t::smem_type A = (typename strategy_t::smem_type)(smem); + typename warp_reduce::TempStorage* temp_storage = (typename warp_reduce::TempStorage*)(A + dim); auto inserter = strategy.init_insert(A, dim); @@ -115,13 +134,12 @@ __global__ void balanced_coo_generalized_spmv_kernel( value_idx start_offset_a, stop_offset_a; bool first_a_chunk, last_a_chunk; - indptrA.get_row_offsets(cur_row_a, start_offset_a, stop_offset_a, - n_blocks_per_row, first_a_chunk, last_a_chunk); + indptrA.get_row_offsets( + cur_row_a, start_offset_a, stop_offset_a, n_blocks_per_row, first_a_chunk, last_a_chunk); // Convert current row vector in A to dense for (int i = tid; i <= (stop_offset_a - start_offset_a); i += blockDim.x) { - strategy.insert(inserter, indicesA[start_offset_a + i], - dataA[start_offset_a + i]); + strategy.insert(inserter, indicesA[start_offset_a + i], dataA[start_offset_a + i]); } __syncthreads(); @@ -132,34 +150,36 @@ __global__ void balanced_coo_generalized_spmv_kernel( if (ind >= nnz_b) return; value_idx start_index_a = 0, stop_index_a = b_ncols - 1; - indptrA.get_indices_boundary(indicesA, cur_row_a, start_offset_a, - stop_offset_a, start_index_a, stop_index_a, - first_a_chunk, last_a_chunk); + indptrA.get_indices_boundary(indicesA, + cur_row_a, + start_offset_a, + stop_offset_a, + start_index_a, + stop_index_a, + first_a_chunk, + last_a_chunk); value_idx cur_row_b = -1; - value_t c = 0.0; + value_t c = 0.0; auto warp_red = warp_reduce(*(temp_storage + warp_id)); if (tid < active_chunk_size) { cur_row_b = rowsB[ind]; - auto index_b = indicesB[ind]; - auto in_bounds = - indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); + auto index_b = indicesB[ind]; + auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); if (in_bounds) { value_t a_col = strategy.find(finder, index_b); - if (!rev || a_col == 0.0) { - c = product_func(a_col, dataB[ind]); - } + if (!rev || a_col == 0.0) { c = product_func(a_col, dataB[ind]); } } } // loop through chunks in parallel, reducing when a new row is // encountered by each thread for (int i = tid; i < active_chunk_size; i += blockDim.x) { - value_idx ind_next = ind + blockDim.x; + value_idx ind_next = ind + blockDim.x; value_idx next_row_b = -1; if (i + blockDim.x < active_chunk_size) next_row_b = rowsB[ind_next]; @@ -170,14 +190,13 @@ __global__ void balanced_coo_generalized_spmv_kernel( // grab the threads currently participating in loops. // because any other threads should have returned already. unsigned int peer_group = __match_any_sync(0xffffffff, cur_row_b); - bool is_leader = get_lowest_peer(peer_group) == lane_id; - value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func); + bool is_leader = get_lowest_peer(peer_group) == lane_id; + value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func); // thread with lowest lane id among peers writes out if (is_leader && v != 0.0) { // this conditional should be uniform, since rev is constant - size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b - : (size_t)cur_row_b * m + cur_row_a; + size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b : (size_t)cur_row_b * m + cur_row_a; write_func(out + idx, v); } @@ -187,15 +206,12 @@ __global__ void balanced_coo_generalized_spmv_kernel( if (next_row_b != -1) { ind = ind_next; - auto index_b = indicesB[ind]; - auto in_bounds = - indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); + auto index_b = indicesB[ind]; + auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); if (in_bounds) { value_t a_col = strategy.find(finder, index_b); - if (!rev || a_col == 0.0) { - c = accum_func(c, product_func(a_col, dataB[ind])); - } + if (!rev || a_col == 0.0) { c = accum_func(c, product_func(a_col, dataB[ind])); } } cur_row_b = next_row_b; diff --git a/cpp/include/raft/sparse/distance/distance.cuh b/cpp/include/raft/sparse/distance/distance.cuh index a1974b3666..228a62ed7a 100644 --- a/cpp/include/raft/sparse/distance/distance.cuh +++ b/cpp/include/raft/sparse/distance/distance.cuh @@ -74,16 +74,17 @@ static const std::unordered_set supportedDistance{ * @param[in] metric distance metric to use */ template -void pairwiseDistance(value_t *out, +void pairwiseDistance(value_t* out, distances_config_t input_config, - raft::distance::DistanceType metric, float metric_arg) { + raft::distance::DistanceType metric, + float metric_arg) +{ switch (metric) { case raft::distance::DistanceType::L2Expanded: l2_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::L2SqrtExpanded: - l2_sqrt_expanded_distances_t(input_config) - .compute(out); + l2_sqrt_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::InnerProduct: ip_distances_t(input_config).compute(out); @@ -92,62 +93,49 @@ void pairwiseDistance(value_t *out, l2_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::L2SqrtUnexpanded: - l2_sqrt_unexpanded_distances_t(input_config) - .compute(out); + l2_sqrt_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::L1: l1_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::LpUnexpanded: - lp_unexpanded_distances_t(input_config, metric_arg) - .compute(out); + lp_unexpanded_distances_t(input_config, metric_arg).compute(out); break; case raft::distance::DistanceType::Linf: - linf_unexpanded_distances_t(input_config) - .compute(out); + linf_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::Canberra: - canberra_unexpanded_distances_t(input_config) - .compute(out); + canberra_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::JaccardExpanded: - jaccard_expanded_distances_t(input_config) - .compute(out); + jaccard_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::CosineExpanded: - cosine_expanded_distances_t(input_config) - .compute(out); + cosine_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::HellingerExpanded: - hellinger_expanded_distances_t(input_config) - .compute(out); + hellinger_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::DiceExpanded: dice_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::CorrelationExpanded: - correlation_expanded_distances_t(input_config) - .compute(out); + correlation_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::RusselRaoExpanded: - russelrao_expanded_distances_t(input_config) - .compute(out); + russelrao_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::HammingUnexpanded: - hamming_unexpanded_distances_t(input_config) - .compute(out); + hamming_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::JensenShannon: - jensen_shannon_unexpanded_distances_t(input_config) - .compute(out); + jensen_shannon_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::KLDivergence: - kl_divergence_unexpanded_distances_t(input_config) - .compute(out); + kl_divergence_unexpanded_distances_t(input_config).compute(out); break; - default: - THROW("Unsupported distance: %d", metric); + default: THROW("Unsupported distance: %d", metric); } } diff --git a/cpp/include/raft/sparse/distance/ip_distance.cuh b/cpp/include/raft/sparse/distance/ip_distance.cuh index 882ccba027..8d77f9f5b5 100644 --- a/cpp/include/raft/sparse/distance/ip_distance.cuh +++ b/cpp/include/raft/sparse/distance/ip_distance.cuh @@ -45,10 +45,13 @@ class ip_distances_t : public distances_t { * Computes simple sparse inner product distances as sum(x_y * y_k) * @param[in] config specifies inputs, outputs, and sizes */ - ip_distances_t(const distances_config_t &config) - : config_(&config), coo_rows_b(config.b_nnz, config.handle.get_stream()) { - raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, - coo_rows_b.data(), config_->b_nnz, + ip_distances_t(const distances_config_t& config) + : config_(&config), coo_rows_b(config.b_nnz, config.handle.get_stream()) + { + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows_b.data(), + config_->b_nnz, config_->handle.get_stream()); } @@ -56,21 +59,21 @@ class ip_distances_t : public distances_t { * Performs pairwise distance computation and computes output distances * @param out_distances dense output matrix (size a_nrows * b_nrows) */ - void compute(value_t *out_distances) { + void compute(value_t* out_distances) + { /** - * Compute pairwise distances and return dense matrix in row-major format - */ + * Compute pairwise distances and return dense matrix in row-major format + */ balanced_coo_pairwise_generalized_spmv( - out_distances, *config_, coo_rows_b.data(), Product(), Sum(), - AtomicAdd()); + out_distances, *config_, coo_rows_b.data(), Product(), Sum(), AtomicAdd()); } - value_idx *b_rows_coo() { return coo_rows_b.data(); } + value_idx* b_rows_coo() { return coo_rows_b.data(); } - value_t *b_data_coo() { return config_->b_data; } + value_t* b_data_coo() { return config_->b_data; } private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector coo_rows_b; }; }; // END namespace distance diff --git a/cpp/include/raft/sparse/distance/l2_distance.cuh b/cpp/include/raft/sparse/distance/l2_distance.cuh index 8886d4c9df..a9a2d1ee91 100644 --- a/cpp/include/raft/sparse/distance/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/l2_distance.cuh @@ -41,35 +41,36 @@ namespace distance { // @TODO: Move this into sparse prims (coo_norm) template -__global__ void compute_row_norm_kernel(value_t *out, - const value_idx *__restrict__ coo_rows, - const value_t *__restrict__ data, - value_idx nnz) { +__global__ void compute_row_norm_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ value_idx i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < nnz) { - atomicAdd(&out[coo_rows[i]], data[i] * data[i]); - } + if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i] * data[i]); } } template -__global__ void compute_row_sum_kernel(value_t *out, - const value_idx *__restrict__ coo_rows, - const value_t *__restrict__ data, - value_idx nnz) { +__global__ void compute_row_sum_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ value_idx i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < nnz) { - atomicAdd(&out[coo_rows[i]], data[i]); - } + if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i]); } } template -__global__ void compute_euclidean_warp_kernel( - value_t *__restrict__ C, const value_t *__restrict__ Q_sq_norms, - const value_t *__restrict__ R_sq_norms, value_idx n_rows, value_idx n_cols, - expansion_f expansion_func) { +__global__ void compute_euclidean_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_sq_norms, + const value_t* __restrict__ R_sq_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func) +{ value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; - value_idx i = tid / n_cols; - value_idx j = tid % n_cols; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; if (i >= n_rows || j >= n_cols) return; @@ -83,25 +84,29 @@ __global__ void compute_euclidean_warp_kernel( } template -__global__ void compute_correlation_warp_kernel( - value_t *__restrict__ C, const value_t *__restrict__ Q_sq_norms, - const value_t *__restrict__ R_sq_norms, const value_t *__restrict__ Q_norms, - const value_t *__restrict__ R_norms, value_idx n_rows, value_idx n_cols, - value_idx n) { +__global__ void compute_correlation_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_sq_norms, + const value_t* __restrict__ R_sq_norms, + const value_t* __restrict__ Q_norms, + const value_t* __restrict__ R_norms, + value_idx n_rows, + value_idx n_cols, + value_idx n) +{ value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; - value_idx i = tid / n_cols; - value_idx j = tid % n_cols; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; if (i >= n_rows || j >= n_cols) return; - value_t dot = C[(size_t)i * n_cols + j]; + value_t dot = C[(size_t)i * n_cols + j]; value_t Q_l1 = Q_norms[i]; value_t R_l1 = R_norms[j]; value_t Q_l2 = Q_sq_norms[i]; value_t R_l2 = R_sq_norms[j]; - value_t numer = n * dot - (Q_l1 * R_l1); + value_t numer = n * dot - (Q_l1 * R_l1); value_t Q_denom = n * Q_l2 - (Q_l1 * Q_l1); value_t R_denom = n * R_l2 - (R_l1 * R_l1); @@ -111,58 +116,77 @@ __global__ void compute_correlation_warp_kernel( C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001); } -template -void compute_euclidean(value_t *C, const value_t *Q_sq_norms, - const value_t *R_sq_norms, value_idx n_rows, - value_idx n_cols, cudaStream_t stream, - expansion_f expansion_func) { +template +void compute_euclidean(value_t* C, + const value_t* Q_sq_norms, + const value_t* R_sq_norms, + value_idx n_rows, + value_idx n_cols, + cudaStream_t stream, + expansion_f expansion_func) +{ int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); compute_euclidean_warp_kernel<<>>( C, Q_sq_norms, R_sq_norms, n_rows, n_cols, expansion_func); } -template -void compute_l2(value_t *out, const value_idx *Q_coo_rows, - const value_t *Q_data, value_idx Q_nnz, - const value_idx *R_coo_rows, const value_t *R_data, - value_idx R_nnz, value_idx m, value_idx n, +template +void compute_l2(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, std::shared_ptr alloc, - cudaStream_t stream, expansion_f expansion_func) { + cudaStream_t stream, + expansion_f expansion_func) +{ rmm::device_uvector Q_sq_norms(m, stream); rmm::device_uvector R_sq_norms(n, stream); - CUDA_CHECK( - cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); compute_row_norm_kernel<<>>( Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); compute_row_norm_kernel<<>>( R_sq_norms.data(), R_coo_rows, R_data, R_nnz); - compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream, - expansion_func); + compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream, expansion_func); } template -void compute_correlation(value_t *C, const value_t *Q_sq_norms, - const value_t *R_sq_norms, const value_t *Q_norms, - const value_t *R_norms, value_idx n_rows, - value_idx n_cols, value_idx n, cudaStream_t stream) { +void compute_correlation(value_t* C, + const value_t* Q_sq_norms, + const value_t* R_sq_norms, + const value_t* Q_norms, + const value_t* R_norms, + value_idx n_rows, + value_idx n_cols, + value_idx n, + cudaStream_t stream) +{ int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); compute_correlation_warp_kernel<<>>( C, Q_sq_norms, R_sq_norms, Q_norms, R_norms, n_rows, n_cols, n); } template -void compute_corr(value_t *out, const value_idx *Q_coo_rows, - const value_t *Q_data, value_idx Q_nnz, - const value_idx *R_coo_rows, const value_t *R_data, - value_idx R_nnz, value_idx m, value_idx n, value_idx n_cols, +void compute_corr(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + value_idx n_cols, std::shared_ptr alloc, - cudaStream_t stream) { + cudaStream_t stream) +{ // sum_sq for std dev rmm::device_uvector Q_sq_norms(m, stream); rmm::device_uvector R_sq_norms(n, stream); @@ -171,15 +195,11 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows, rmm::device_uvector Q_norms(m, stream); rmm::device_uvector R_norms(n, stream); - CUDA_CHECK( - cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); compute_row_norm_kernel<<>>( Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); @@ -191,8 +211,15 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows, compute_row_sum_kernel<<>>( R_norms.data(), R_coo_rows, R_data, R_nnz); - compute_correlation(out, Q_sq_norms.data(), R_sq_norms.data(), Q_norms.data(), - R_norms.data(), m, n, n_cols, stream); + compute_correlation(out, + Q_sq_norms.data(), + R_sq_norms.data(), + Q_norms.data(), + R_norms.data(), + m, + n, + n_cols, + stream); } /** @@ -202,35 +229,45 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows, template class l2_expanded_distances_t : public distances_t { public: - explicit l2_expanded_distances_t( - const distances_config_t &config) - : config_(&config), ip_dists(config) {} + explicit l2_expanded_distances_t(const distances_config_t& config) + : config_(&config), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_l2( - out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, - b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - return -2 * dot + q_norm + r_norm; - }); + compute_l2(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->handle.get_device_allocator(), + config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + return -2 * dot + q_norm + r_norm; + }); } ~l2_expanded_distances_t() = default; protected: - const distances_config_t *config_; + const distances_config_t* config_; ip_distances_t ip_dists; }; @@ -239,18 +276,21 @@ class l2_expanded_distances_t : public distances_t { * The expanded form is more efficient for sparse data. */ template -class l2_sqrt_expanded_distances_t - : public l2_expanded_distances_t { +class l2_sqrt_expanded_distances_t : public l2_expanded_distances_t { public: - explicit l2_sqrt_expanded_distances_t( - const distances_config_t &config) - : l2_expanded_distances_t(config) {} + explicit l2_sqrt_expanded_distances_t(const distances_config_t& config) + : l2_expanded_distances_t(config) + { + } - void compute(value_t *out_dists) override { + void compute(value_t* out_dists) override + { l2_expanded_distances_t::compute(out_dists); // Sqrt Post-processing raft::linalg::unaryOp( - out_dists, out_dists, this->config_->a_nrows * this->config_->b_nrows, + out_dists, + out_dists, + this->config_->a_nrows * this->config_->b_nrows, [] __device__(value_t input) { int neg = input < 0 ? -1 : 1; return sqrt(abs(input) * neg); @@ -264,25 +304,35 @@ class l2_sqrt_expanded_distances_t template class correlation_expanded_distances_t : public distances_t { public: - explicit correlation_expanded_distances_t( - const distances_config_t &config) - : config_(&config), ip_dists(config) {} + explicit correlation_expanded_distances_t(const distances_config_t& config) + : config_(&config), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_corr(out_dists, search_coo_rows.data(), config_->a_data, - config_->a_nnz, b_indices, b_data, config_->b_nnz, - config_->a_nrows, config_->b_nrows, config_->b_ncols, + compute_corr(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->b_ncols, config_->handle.get_device_allocator(), config_->handle.get_stream()); } @@ -290,54 +340,62 @@ class correlation_expanded_distances_t : public distances_t { ~correlation_expanded_distances_t() = default; protected: - const distances_config_t *config_; + const distances_config_t* config_; ip_distances_t ip_dists; }; /** - * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k)^2) * sqrt(sum(y_k)^2))) - * The expanded form is more efficient for sparse data. + * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k)^2) * + * sqrt(sum(y_k)^2))) The expanded form is more efficient for sparse data. */ template class cosine_expanded_distances_t : public distances_t { public: - explicit cosine_expanded_distances_t( - const distances_config_t &config) - : config_(&config), - workspace(0, config.handle.get_stream()), - ip_dists(config) {} + explicit cosine_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_l2( - out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, - b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - value_t norms = sqrt(q_norm) * sqrt(r_norm); - // deal with potential for 0 in denominator by forcing 0/1 instead - value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms); - - // flip the similarity when both rows are 0 - bool both_empty = (q_norm == 0) && (r_norm == 0); - return 1 - ((!both_empty * cos) + both_empty); - }); + compute_l2(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->handle.get_device_allocator(), + config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t norms = sqrt(q_norm) * sqrt(r_norm); + // deal with potential for 0 in denominator by forcing 0/1 instead + value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms); + + // flip the similarity when both rows are 0 + bool both_empty = (q_norm == 0) && (r_norm == 0); + return 1 - ((!both_empty * cos) + both_empty); + }); } ~cosine_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; @@ -354,25 +412,34 @@ class cosine_expanded_distances_t : public distances_t { template class hellinger_expanded_distances_t : public distances_t { public: - explicit hellinger_expanded_distances_t( - const distances_config_t &config) - : config_(&config), workspace(0, config.handle.get_stream()) {} + explicit hellinger_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { rmm::device_uvector coo_rows(max(config_->b_nnz, config_->a_nnz), config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, - coo_rows.data(), config_->b_nnz, + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv( - out_dists, *config_, coo_rows.data(), - [] __device__(value_t a, value_t b) { return sqrt(a) * sqrt(b); }, Sum(), + out_dists, + *config_, + coo_rows.data(), + [] __device__(value_t a, value_t b) { return sqrt(a) * sqrt(b); }, + Sum(), AtomicAdd()); raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative bool rectifier = (1 - input) > 0; @@ -384,42 +451,43 @@ class hellinger_expanded_distances_t : public distances_t { ~hellinger_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; }; template class russelrao_expanded_distances_t : public distances_t { public: - explicit russelrao_expanded_distances_t( - const distances_config_t &config) - : config_(&config), - workspace(0, config.handle.get_stream()), - ip_dists(config) {} + explicit russelrao_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_t n_cols = config_->a_ncols; + value_t n_cols = config_->a_ncols; value_t n_cols_inv = 1.0 / n_cols; raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return (n_cols - input) * n_cols_inv; }, config_->handle.get_stream()); - auto exec_policy = rmm::exec_policy(config_->handle.get_stream()); - auto diags = thrust::counting_iterator(0); + auto exec_policy = rmm::exec_policy(config_->handle.get_stream()); + auto diags = thrust::counting_iterator(0); value_idx b_nrows = config_->b_nrows; - thrust::for_each(exec_policy, diags, diags + config_->a_nrows, - [=] __device__(value_idx input) { - out_dists[input * b_nrows + input] = 0.0; - }); + thrust::for_each(exec_policy, diags, diags + config_->a_nrows, [=] __device__(value_idx input) { + out_dists[input * b_nrows + input] = 0.0; + }); } ~russelrao_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; diff --git a/cpp/include/raft/sparse/distance/lp_distance.cuh b/cpp/include/raft/sparse/distance/lp_distance.cuh index 885d55ee50..7f9511ff03 100644 --- a/cpp/include/raft/sparse/distance/lp_distance.cuh +++ b/cpp/include/raft/sparse/distance/lp_distance.cuh @@ -38,23 +38,33 @@ namespace raft { namespace sparse { namespace distance { -template -void unexpanded_lp_distances( - value_t *out_dists, const distances_config_t *config_, - product_f product_func, accum_f accum_func, write_f write_func) { +template +void unexpanded_lp_distances(value_t* out_dists, + const distances_config_t* config_, + product_f product_func, + accum_f accum_func, + write_f write_func) +{ rmm::device_uvector coo_rows(max(config_->b_nnz, config_->a_nnz), config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, - coo_rows.data(), config_->b_nnz, + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv( out_dists, *config_, coo_rows.data(), product_func, accum_func, write_func); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - coo_rows.data(), config_->a_nnz, + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv_rev( @@ -71,48 +81,51 @@ void unexpanded_lp_distances( template class l1_unexpanded_distances_t : public distances_t { public: - l1_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + l1_unexpanded_distances_t(const distances_config_t& config) : config_(&config) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, AbsDiff(), - Sum(), AtomicAdd()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, AbsDiff(), Sum(), AtomicAdd()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class l2_unexpanded_distances_t : public distances_t { public: - l2_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + l2_unexpanded_distances_t(const distances_config_t& config) : config_(&config) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, SqDiff(), - Sum(), AtomicAdd()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, SqDiff(), Sum(), AtomicAdd()); } protected: - const distances_config_t *config_; + const distances_config_t* config_; }; template -class l2_sqrt_unexpanded_distances_t - : public l2_unexpanded_distances_t { +class l2_sqrt_unexpanded_distances_t : public l2_unexpanded_distances_t { public: - l2_sqrt_unexpanded_distances_t( - const distances_config_t &config) - : l2_unexpanded_distances_t(config) {} + l2_sqrt_unexpanded_distances_t(const distances_config_t& config) + : l2_unexpanded_distances_t(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { l2_unexpanded_distances_t::compute(out_dists); // Sqrt Post-processing raft::linalg::unaryOp( - out_dists, out_dists, this->config_->a_nrows * this->config_->b_nrows, + out_dists, + out_dists, + this->config_->a_nrows * this->config_->b_nrows, [] __device__(value_t input) { int neg = input < 0 ? -1 : 1; return sqrt(abs(input) * neg); @@ -124,29 +137,33 @@ class l2_sqrt_unexpanded_distances_t template class linf_unexpanded_distances_t : public distances_t { public: - explicit linf_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + explicit linf_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, AbsDiff(), - Max(), AtomicMax()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, AbsDiff(), Max(), AtomicMax()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class canberra_unexpanded_distances_t : public distances_t { public: - explicit canberra_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + explicit canberra_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { unexpanded_lp_distances( - out_dists, config_, + out_dists, + config_, [] __device__(value_t a, value_t b) { value_t d = fabs(a) + fabs(b); @@ -154,70 +171,82 @@ class canberra_unexpanded_distances_t : public distances_t { // forcing 1/0 instead return ((d != 0) * fabs(a - b)) / (d + (d == 0)); }, - Sum(), AtomicAdd()); + Sum(), + AtomicAdd()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class lp_unexpanded_distances_t : public distances_t { public: - explicit lp_unexpanded_distances_t( - const distances_config_t &config, value_t p_) - : config_(&config), p(p_) {} + explicit lp_unexpanded_distances_t(const distances_config_t& config, + value_t p_) + : config_(&config), p(p_) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, PDiff(p), - Sum(), AtomicAdd()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, PDiff(p), Sum(), AtomicAdd()); float one_over_p = 1.0f / p; raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return pow(input, one_over_p); }, config_->handle.get_stream()); } private: - const distances_config_t *config_; + const distances_config_t* config_; value_t p; }; template class hamming_unexpanded_distances_t : public distances_t { public: - explicit hamming_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + explicit hamming_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, NotEqual(), - Sum(), AtomicAdd()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, NotEqual(), Sum(), AtomicAdd()); value_t n_cols = 1.0 / config_->a_ncols; raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return input * n_cols; }, config_->handle.get_stream()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class jensen_shannon_unexpanded_distances_t : public distances_t { public: explicit jensen_shannon_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { unexpanded_lp_distances( - out_dists, config_, + out_dists, + config_, [] __device__(value_t a, value_t b) { - value_t m = 0.5f * (a + b); + value_t m = 0.5f * (a + b); bool a_zero = a == 0; bool b_zero = b == 0; @@ -227,49 +256,61 @@ class jensen_shannon_unexpanded_distances_t : public distances_t { bool x_zero = x == 0; bool y_zero = y == 0; - return (-a * (!x_zero * log(x + x_zero))) + - (-b * (!y_zero * log(y + y_zero))); + return (-a * (!x_zero * log(x + x_zero))) + (-b * (!y_zero * log(y + y_zero))); }, - Sum(), AtomicAdd()); + Sum(), + AtomicAdd()); raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return sqrt(0.5 * input); }, config_->handle.get_stream()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class kl_divergence_unexpanded_distances_t : public distances_t { public: explicit kl_divergence_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { rmm::device_uvector coo_rows(max(config_->b_nnz, config_->a_nnz), config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, - coo_rows.data(), config_->b_nnz, + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv( - out_dists, *config_, coo_rows.data(), - [] __device__(value_t a, value_t b) { return a * log(a / b); }, Sum(), + out_dists, + *config_, + coo_rows.data(), + [] __device__(value_t a, value_t b) { return a * log(a / b); }, + Sum(), AtomicAdd()); raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return 0.5 * input; }, config_->handle.get_stream()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; }; // END namespace distance diff --git a/cpp/include/raft/sparse/distance/operators.cuh b/cpp/include/raft/sparse/distance/operators.cuh index 89acda8b1a..3a9d0ba879 100644 --- a/cpp/include/raft/sparse/distance/operators.cuh +++ b/cpp/include/raft/sparse/distance/operators.cuh @@ -24,21 +24,24 @@ namespace distance { struct Sum { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return a + b; } }; struct NotEqual { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return a != b; } }; struct SqDiff { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return (a - b) * (a - b); } }; @@ -49,44 +52,48 @@ struct PDiff { PDiff(float p_) : p(p_) {} template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return pow(a - b, p); } }; struct Max { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return fmax(a, b); } }; struct AtomicAdd { template - __host__ __device__ __forceinline__ value_t operator()(value_t *a, - value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t* a, value_t b) + { return atomicAdd(a, b); } }; struct AtomicMax { template - __host__ __device__ __forceinline__ value_t operator()(value_t *a, - value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t* a, value_t b) + { return atomicMax(a, b); } }; struct Product { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return a * b; } }; struct AbsDiff { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return fabs(a - b); } }; diff --git a/cpp/include/raft/sparse/distance/utils.cuh b/cpp/include/raft/sparse/distance/utils.cuh index 6b6d77a2d5..d78b927e46 100644 --- a/cpp/include/raft/sparse/distance/utils.cuh +++ b/cpp/include/raft/sparse/distance/utils.cuh @@ -34,10 +34,10 @@ namespace distance { * @return the maximum number of columns that can be stored in smem */ template -inline int max_cols_per_block() { +inline int max_cols_per_block() +{ // max cols = (total smem available - cub reduction smem) - return (raft::getSharedMemPerBlock() - - ((tpb / raft::warp_size()) * sizeof(value_t))) / + return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) / sizeof(value_t); } diff --git a/cpp/include/raft/sparse/hierarchy/common.h b/cpp/include/raft/sparse/hierarchy/common.h index 29f541498b..1738dd7498 100644 --- a/cpp/include/raft/sparse/hierarchy/common.h +++ b/cpp/include/raft/sparse/hierarchy/common.h @@ -37,13 +37,15 @@ class linkage_output { value_idx n_leaves; value_idx n_connected_components; - value_idx *labels; // size: m + value_idx* labels; // size: m - value_idx *children; // size: (m-1, 2) + value_idx* children; // size: (m-1, 2) }; -class linkage_output_int_float : public linkage_output {}; -class linkage_output__int64_float : public linkage_output {}; +class linkage_output_int_float : public linkage_output { +}; +class linkage_output__int64_float : public linkage_output { +}; }; // namespace hierarchy }; // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 1ac075489a..95df7f4642 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -42,31 +42,32 @@ class UnionFind { value_idx n_indices; UnionFind(value_idx N_) - : n_indices(2 * N_ - 1), - parent(2 * N_ - 1, -1), - size(2 * N_ - 1, 1), - next_label(N_) { + : n_indices(2 * N_ - 1), parent(2 * N_ - 1, -1), size(2 * N_ - 1, 1), next_label(N_) + { memset(size.data() + N_, 0, (size.size() - N_) * sizeof(value_idx)); } - value_idx find(value_idx n) { + value_idx find(value_idx n) + { value_idx p; p = n; - while (parent[n] != -1) n = parent[n]; + while (parent[n] != -1) + n = parent[n]; // path compression while (parent[p] != n) { - p = parent[p == -1 ? n_indices - 1 : p]; + p = parent[p == -1 ? n_indices - 1 : p]; parent[p == -1 ? n_indices - 1 : p] = n; } return n; } - void perform_union(value_idx m, value_idx n) { + void perform_union(value_idx m, value_idx n) + { size[next_label] = size[m] + size[n]; - parent[m] = next_label; - parent[n] = next_label; + parent[m] = next_label; + parent[n] = next_label; next_label += 1; } @@ -95,12 +96,17 @@ class UnionFind { * @param[out] out_size cluster sizes of output */ template -void build_dendrogram_host(const handle_t &handle, const value_idx *rows, - const value_idx *cols, const value_t *data, - size_t nnz, value_idx *children, value_t *out_delta, - value_idx *out_size) { +void build_dendrogram_host(const handle_t& handle, + const value_idx* rows, + const value_idx* cols, + const value_t* data, + size_t nnz, + value_idx* children, + value_t* out_delta, + value_idx* out_size) +{ auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); value_idx n_edges = nnz; @@ -121,8 +127,8 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows, UnionFind U(nnz + 1); for (value_idx i = 0; i < nnz; i++) { - value_idx a = mst_src_h[i]; - value_idx b = mst_dst_h[i]; + value_idx a = mst_src_h[i]; + value_idx b = mst_dst_h[i]; value_t delta = mst_weights_h[i]; value_idx aa = U.find(a); @@ -130,10 +136,10 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows, value_idx children_idx = i * 2; - children_h[children_idx] = aa; + children_h[children_idx] = aa; children_h[children_idx + 1] = bb; - out_delta_h[i] = delta; - out_size_h[i] = U.size[aa] + U.size[bb]; + out_delta_h[i] = delta; + out_size_h[i] = U.size[aa] + U.size[bb]; U.perform_union(aa, bb); } @@ -144,13 +150,15 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows, } template -__global__ void write_levels_kernel(const value_idx *children, - value_idx *parents, value_idx n_vertices) { +__global__ void write_levels_kernel(const value_idx* children, + value_idx* parents, + value_idx n_vertices) +{ value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < n_vertices) { value_idx level = tid / 2; value_idx child = children[tid]; - parents[child] = level; + parents[child] = level; } } @@ -166,14 +174,17 @@ __global__ void write_levels_kernel(const value_idx *children, * @param labels */ template -__global__ void inherit_labels(const value_idx *children, - const value_idx *levels, size_t n_leaves, - value_idx *labels, int cut_level, - value_idx n_vertices) { +__global__ void inherit_labels(const value_idx* children, + const value_idx* levels, + size_t n_leaves, + value_idx* labels, + int cut_level, + value_idx n_vertices) +{ value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < n_vertices) { - value_idx node = children[tid]; + value_idx node = children[tid]; value_idx cur_level = tid / 2; /** @@ -183,12 +194,12 @@ __global__ void inherit_labels(const value_idx *children, if (cur_level > cut_level) return; value_idx cur_parent = node; - value_idx label = labels[cur_parent]; + value_idx label = labels[cur_parent]; while (label == -1) { cur_parent = cur_level + n_leaves; - cur_level = levels[cur_parent]; - label = labels[cur_parent]; + cur_level = levels[cur_parent]; + label = labels[cur_parent]; } labels[node] = label; @@ -197,15 +208,16 @@ __global__ void inherit_labels(const value_idx *children, template struct init_label_roots { - init_label_roots(value_idx *labels_) : labels(labels_) {} + init_label_roots(value_idx* labels_) : labels(labels_) {} template - __host__ __device__ void operator()(Tuple t) { + __host__ __device__ void operator()(Tuple t) + { labels[thrust::get<1>(t)] = thrust::get<0>(t); } private: - value_idx *labels; + value_idx* labels; }; /** @@ -221,11 +233,14 @@ struct init_label_roots { * @param n_leaves */ template -void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, - const value_idx *children, size_t n_clusters, - size_t n_leaves) { - auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); +void extract_flattened_clusters(const raft::handle_t& handle, + value_idx* labels, + const value_idx* children, + size_t n_clusters, + size_t n_leaves) +{ + auto d_alloc = handle.get_device_allocator(); + auto stream = handle.get_stream(); auto thrust_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); // Handle special case where n_clusters == 1 @@ -243,10 +258,8 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, size_t n_edges = (n_leaves - 1) * 2; - thrust::device_ptr d_ptr = - thrust::device_pointer_cast(children); - value_idx n_vertices = - *(thrust::max_element(thrust_policy, d_ptr, d_ptr + n_edges)) + 1; + thrust::device_ptr d_ptr = thrust::device_pointer_cast(children); + value_idx n_vertices = *(thrust::max_element(thrust_policy, d_ptr, d_ptr + n_edges)) + 1; // Prevent potential infinite loop from labeling disconnected // connectivities graph. @@ -257,8 +270,7 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, rmm::device_uvector levels(n_vertices, stream); value_idx n_blocks = ceildiv(n_vertices, (value_idx)tpb); - write_levels_kernel<<>>(children, levels.data(), - n_vertices); + write_levels_kernel<<>>(children, levels.data(), n_vertices); /** * Step 1: Find label roots: * @@ -272,27 +284,26 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, rmm::device_uvector label_roots(child_size, stream); value_idx children_cpy_start = n_edges - child_size; - raft::copy_async(label_roots.data(), children + children_cpy_start, - child_size, stream); + raft::copy_async(label_roots.data(), children + children_cpy_start, child_size, stream); - thrust::sort(thrust_policy, label_roots.data(), + thrust::sort(thrust_policy, + label_roots.data(), label_roots.data() + (child_size), thrust::greater()); rmm::device_uvector tmp_labels(n_vertices, stream); // Init labels to -1 - thrust::fill(thrust_policy, tmp_labels.data(), - tmp_labels.data() + n_vertices, -1); + thrust::fill(thrust_policy, tmp_labels.data(), tmp_labels.data() + n_vertices, -1); // Write labels for cluster roots to "labels" thrust::counting_iterator first(0); - auto z_iter = thrust::make_zip_iterator(thrust::make_tuple( - first, label_roots.data() + (label_roots.size() - n_clusters))); + auto z_iter = thrust::make_zip_iterator( + thrust::make_tuple(first, label_roots.data() + (label_roots.size() - n_clusters))); - thrust::for_each(thrust_policy, z_iter, z_iter + n_clusters, - init_label_roots(tmp_labels.data())); + thrust::for_each( + thrust_policy, z_iter, z_iter + n_clusters, init_label_roots(tmp_labels.data())); /** * Step 2: Propagate labels by having children iterate through their parents @@ -302,9 +313,8 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, */ value_idx cut_level = (n_edges / 2) - (n_clusters - 1); - inherit_labels<<>>(children, levels.data(), - n_leaves, tmp_labels.data(), - cut_level, n_vertices); + inherit_labels<<>>( + children, levels.data(), n_leaves, tmp_labels.data(), cut_level, n_vertices); // copy tmp labels to actual labels raft::copy_async(labels, tmp_labels.data(), n_leaves, stream); diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh index 7cf959dda6..096f1c650f 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh @@ -37,14 +37,17 @@ namespace raft { namespace hierarchy { namespace detail { -template +template struct distance_graph_impl { - void run(const raft::handle_t &handle, const value_t *X, size_t m, size_t n, + void run(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, raft::distance::DistanceType metric, - rmm::device_uvector &indptr, - rmm::device_uvector &indices, - rmm::device_uvector &data, int c); + rmm::device_uvector& indptr, + rmm::device_uvector& indices, + rmm::device_uvector& data, + int c); }; /** @@ -53,50 +56,51 @@ struct distance_graph_impl { * @tparam value_t */ template -struct distance_graph_impl { - void run(const raft::handle_t &handle, const value_t *X, size_t m, size_t n, +struct distance_graph_impl { + void run(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, raft::distance::DistanceType metric, - rmm::device_uvector &indptr, - rmm::device_uvector &indices, - rmm::device_uvector &data, int c) { - auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + rmm::device_uvector& indptr, + rmm::device_uvector& indices, + rmm::device_uvector& data, + int c) + { + auto d_alloc = handle.get_device_allocator(); + auto stream = handle.get_stream(); auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); // Need to symmetrize knn into undirected graph raft::sparse::COO knn_graph_coo(d_alloc, stream); - raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, - c); + raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, c); indices.resize(knn_graph_coo.nnz, stream); data.resize(knn_graph_coo.nnz, stream); // self-loops get max distance - auto transform_in = thrust::make_zip_iterator(thrust::make_tuple( - knn_graph_coo.rows(), knn_graph_coo.cols(), knn_graph_coo.vals())); - - thrust::transform( - exec_policy, transform_in, transform_in + knn_graph_coo.nnz, - knn_graph_coo.vals(), - [=] __device__(const thrust::tuple &tup) { - bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup); - return (self_loop * std::numeric_limits::max()) + - (!self_loop * thrust::get<2>(tup)); - }); - - raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), - knn_graph_coo.nnz, indptr.data(), - m + 1, d_alloc, stream); + auto transform_in = thrust::make_zip_iterator( + thrust::make_tuple(knn_graph_coo.rows(), knn_graph_coo.cols(), knn_graph_coo.vals())); + + thrust::transform(exec_policy, + transform_in, + transform_in + knn_graph_coo.nnz, + knn_graph_coo.vals(), + [=] __device__(const thrust::tuple& tup) { + bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup); + return (self_loop * std::numeric_limits::max()) + + (!self_loop * thrust::get<2>(tup)); + }); + + raft::sparse::convert::sorted_coo_to_csr( + knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, d_alloc, stream); // TODO: Wouldn't need to copy here if we could compute knn // graph directly on the device uvectors // ref: https://github.com/rapidsai/raft/issues/227 - raft::copy_async(indices.data(), knn_graph_coo.cols(), knn_graph_coo.nnz, - stream); - raft::copy_async(data.data(), knn_graph_coo.vals(), knn_graph_coo.nnz, - stream); + raft::copy_async(indices.data(), knn_graph_coo.cols(), knn_graph_coo.nnz, stream); + raft::copy_async(data.data(), knn_graph_coo.vals(), knn_graph_coo.nnz, stream); } }; @@ -116,13 +120,17 @@ struct distance_graph_impl -void get_distance_graph(const raft::handle_t &handle, const value_t *X, - size_t m, size_t n, raft::distance::DistanceType metric, - rmm::device_uvector &indptr, - rmm::device_uvector &indices, - rmm::device_uvector &data, int c) { +template +void get_distance_graph(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, + raft::distance::DistanceType metric, + rmm::device_uvector& indptr, + rmm::device_uvector& indices, + rmm::device_uvector& data, + int c) +{ auto stream = handle.get_stream(); indptr.resize(m + 1, stream); diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh index 765a5ad77f..f939e87484 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh @@ -37,9 +37,10 @@ namespace hierarchy { namespace detail { template -void merge_msts(raft::Graph_COO &coo1, - raft::Graph_COO &coo2, - cudaStream_t stream) { +void merge_msts(raft::Graph_COO& coo1, + raft::Graph_COO& coo2, + cudaStream_t stream) +{ /** Add edges to existing mst **/ int final_nnz = coo2.n_edges + coo1.n_edges; @@ -50,12 +51,9 @@ void merge_msts(raft::Graph_COO &coo1, /** * Construct final edge list */ - raft::copy_async(coo1.src.data() + coo1.n_edges, coo2.src.data(), - coo2.n_edges, stream); - raft::copy_async(coo1.dst.data() + coo1.n_edges, coo2.dst.data(), - coo2.n_edges, stream); - raft::copy_async(coo1.weights.data() + coo1.n_edges, coo2.weights.data(), - coo2.n_edges, stream); + raft::copy_async(coo1.src.data() + coo1.n_edges, coo2.src.data(), coo2.n_edges, stream); + raft::copy_async(coo1.dst.data() + coo1.n_edges, coo2.dst.data(), coo2.n_edges, stream); + raft::copy_async(coo1.weights.data() + coo1.n_edges, coo2.weights.data(), coo2.n_edges, stream); coo1.n_edges = final_nnz; } @@ -74,14 +72,18 @@ void merge_msts(raft::Graph_COO &coo1, * @return updated MST edge list */ template -void connect_knn_graph(const raft::handle_t &handle, const value_t *X, - raft::Graph_COO &msf, - size_t m, size_t n, value_idx *color, - red_op reduction_op, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2SqrtExpanded) { +void connect_knn_graph( + const raft::handle_t& handle, + const value_t* X, + raft::Graph_COO& msf, + size_t m, + size_t n, + value_idx* color, + red_op reduction_op, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) +{ auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); raft::sparse::COO connected_edges(d_alloc, stream); @@ -89,15 +91,21 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X, handle, connected_edges, X, color, m, n, reduction_op); rmm::device_uvector indptr2(m + 1, stream); - raft::sparse::convert::sorted_coo_to_csr(connected_edges.rows(), - connected_edges.nnz, indptr2.data(), - m + 1, d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr( + connected_edges.rows(), connected_edges.nnz, indptr2.data(), m + 1, d_alloc, stream); // On the second call, we hand the MST the original colors // and the new set of edges and let it restart the optimization process - auto new_mst = raft::mst::mst( - handle, indptr2.data(), connected_edges.cols(), connected_edges.vals(), m, - connected_edges.nnz, color, stream, false, false); + auto new_mst = raft::mst::mst(handle, + indptr2.data(), + connected_edges.cols(), + connected_edges.vals(), + m, + connected_edges.nnz, + color, + stream, + false, + false); merge_msts(msf, new_mst, stream); } @@ -127,29 +135,35 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X, * argument is really just a safeguard against the potential for infinite loops. */ template -void build_sorted_mst(const raft::handle_t &handle, const value_t *X, - const value_idx *indptr, const value_idx *indices, - const value_t *pw_dists, size_t m, size_t n, - value_idx *mst_src, value_idx *mst_dst, - value_t *mst_weight, value_idx *color, size_t nnz, - red_op reduction_op, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2SqrtExpanded, - int max_iter = 10) { +void build_sorted_mst( + const raft::handle_t& handle, + const value_t* X, + const value_idx* indptr, + const value_idx* indices, + const value_t* pw_dists, + size_t m, + size_t n, + value_idx* mst_src, + value_idx* mst_dst, + value_t* mst_weight, + value_idx* color, + size_t nnz, + red_op reduction_op, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded, + int max_iter = 10) +{ auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // We want to have MST initialize colors on first call. auto mst_coo = raft::mst::mst( - handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false, - true); + handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false, true); - int iters = 1; + int iters = 1; int n_components = linkage::get_n_components(color, m, d_alloc, stream); while (n_components > 1 && iters < max_iter) { - connect_knn_graph(handle, X, mst_coo, m, n, color, - reduction_op); + connect_knn_graph(handle, X, mst_coo, m, n, color, reduction_op); iters++; @@ -176,9 +190,8 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, " or increase 'max_iter'", max_iter); - raft::sparse::op::coo_sort_by_weight(mst_coo.src.data(), mst_coo.dst.data(), - mst_coo.weights.data(), mst_coo.n_edges, - stream); + raft::sparse::op::coo_sort_by_weight( + mst_coo.src.data(), mst_coo.dst.data(), mst_coo.weights.data(), mst_coo.n_edges, stream); raft::copy_async(mst_src, mst_coo.src.data(), mst_coo.n_edges, stream); raft::copy_async(mst_dst, mst_coo.dst.data(), mst_coo.n_edges, stream); diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp index 01a033945c..fe9538120f 100644 --- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp +++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp @@ -44,20 +44,26 @@ static const size_t EMPTY = 0; * @param[in] n number of columns in X * @param[in] metric distance metrix to use when constructing connectivities graph * @param[out] out struct containing output dendrogram and cluster assignments - * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect control + * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect + control * of k. The algorithm will set `k = log(n) + c` * @param[in] n_clusters number of clusters to assign data samples */ -template -void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m, - size_t n, raft::distance::DistanceType metric, - linkage_output *out, int c, - size_t n_clusters) { - ASSERT(n_clusters <= m, - "n_clusters must be less than or equal to the number of data points"); - - auto stream = handle.get_stream(); +void single_linkage(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, + raft::distance::DistanceType metric, + linkage_output* out, + int c, + size_t n_clusters) +{ + ASSERT(n_clusters <= m, "n_clusters must be less than or equal to the number of data points"); + + auto stream = handle.get_stream(); auto d_alloc = handle.get_device_allocator(); rmm::device_uvector indptr(EMPTY, stream); @@ -79,10 +85,20 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m, */ rmm::device_uvector color(m, stream); raft::linkage::FixConnectivitiesRedOp op(color.data(), m); - detail::build_sorted_mst( - handle, X, indptr.data(), indices.data(), pw_dists.data(), m, n, - mst_rows.data(), mst_cols.data(), mst_data.data(), color.data(), - indices.size(), op, metric); + detail::build_sorted_mst(handle, + X, + indptr.data(), + indices.data(), + pw_dists.data(), + m, + n, + mst_rows.data(), + mst_cols.data(), + mst_data.data(), + color.data(), + indices.size(), + op, + metric); pw_dists.release(); @@ -94,15 +110,19 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m, rmm::device_uvector out_delta(n_edges, stream); rmm::device_uvector out_size(n_edges, stream); // Create dendrogram - detail::build_dendrogram_host( - handle, mst_rows.data(), mst_cols.data(), mst_data.data(), n_edges, - out->children, out_delta.data(), out_size.data()); - detail::extract_flattened_clusters(handle, out->labels, out->children, - n_clusters, m); - - out->m = m; - out->n_clusters = n_clusters; - out->n_leaves = m; + detail::build_dendrogram_host(handle, + mst_rows.data(), + mst_cols.data(), + mst_data.data(), + n_edges, + out->children, + out_delta.data(), + out_size.data()); + detail::extract_flattened_clusters(handle, out->labels, out->children, n_clusters, m); + + out->m = m; + out->n_clusters = n_clusters; + out->n_leaves = m; out->n_connected_components = 1; } diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh index 47b1ba6e41..01735a102d 100644 --- a/cpp/include/raft/sparse/linalg/add.cuh +++ b/cpp/include/raft/sparse/linalg/add.cuh @@ -40,40 +40,47 @@ namespace sparse { namespace linalg { template -__global__ void csr_add_calc_row_counts_kernel( - const int *a_ind, const int *a_indptr, const T *a_val, int nnz1, - const int *b_ind, const int *b_indptr, const T *b_val, int nnz2, int m, - int *out_rowcounts) { +__global__ void csr_add_calc_row_counts_kernel(const int* a_ind, + const int* a_indptr, + const T* a_val, + int nnz1, + const int* b_ind, + const int* b_indptr, + const T* b_val, + int nnz2, + int m, + int* out_rowcounts) +{ // loop through columns in each set of rows and // calculate number of unique cols across both rows int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < m) { int a_start_idx = a_ind[row]; - int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind); + int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind); int b_start_idx = b_ind[row]; - int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); + int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); /** - * Union of columns within each row of A and B so that we can scan through - * them, adding their values together. - */ + * Union of columns within each row of A and B so that we can scan through + * them, adding their values together. + */ int max_size = (a_stop_idx - a_start_idx) + (b_stop_idx - b_start_idx); - int *arr = new int[max_size]; + int* arr = new int[max_size]; int cur_arr_idx = 0; for (int j = a_start_idx; j < a_stop_idx; j++) { arr[cur_arr_idx] = a_indptr[j]; cur_arr_idx++; } - int arr_size = cur_arr_idx; + int arr_size = cur_arr_idx; int final_size = arr_size; for (int j = b_start_idx; j < b_stop_idx; j++) { int cur_col = b_indptr[j]; - bool found = false; + bool found = false; for (int k = 0; k < arr_size; k++) { if (arr[k] == cur_col) { found = true; @@ -81,9 +88,7 @@ __global__ void csr_add_calc_row_counts_kernel( } } - if (!found) { - final_size++; - } + if (!found) { final_size++; } } out_rowcounts[row] = final_size; @@ -94,11 +99,19 @@ __global__ void csr_add_calc_row_counts_kernel( } template -__global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, - const T *a_val, int nnz1, const int *b_ind, - const int *b_indptr, const T *b_val, int nnz2, - int m, int *out_ind, int *out_indptr, - T *out_val) { +__global__ void csr_add_kernel(const int* a_ind, + const int* a_indptr, + const T* a_val, + int nnz1, + const int* b_ind, + const int* b_indptr, + const T* b_val, + int nnz2, + int m, + int* out_ind, + int* out_indptr, + T* out_val) +{ // 1 thread per row int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -109,21 +122,21 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind); int b_start_idx = b_ind[row]; - int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); + int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); int o_idx = out_ind[row]; int cur_o_idx = o_idx; for (int j = a_start_idx; j < a_stop_idx; j++) { out_indptr[cur_o_idx] = a_indptr[j]; - out_val[cur_o_idx] = a_val[j]; + out_val[cur_o_idx] = a_val[j]; cur_o_idx++; } int arr_size = cur_o_idx - o_idx; for (int j = b_start_idx; j < b_stop_idx; j++) { int cur_col = b_indptr[j]; - bool found = false; + bool found = false; for (int k = o_idx; k < o_idx + arr_size; k++) { // If we found a match, sum the two values if (out_indptr[k] == cur_col) { @@ -136,7 +149,7 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, // if we didn't find a match, add the value for b if (!found) { out_indptr[o_idx + arr_size] = cur_col; - out_val[o_idx + arr_size] = b_val[j]; + out_val[o_idx + arr_size] = b_val[j]; arr_size++; } } @@ -160,32 +173,36 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, * @param stream: cuda stream to use */ template -size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, - int nnz1, const int *b_ind, const int *b_indptr, - const T *b_val, int nnz2, int m, int *out_ind, +size_t csr_add_calc_inds(const int* a_ind, + const int* a_indptr, + const T* a_val, + int nnz1, + const int* b_ind, + const int* b_indptr, + const T* b_val, + int nnz2, + int m, + int* out_ind, std::shared_ptr d_alloc, - cudaStream_t stream) { + cudaStream_t stream) +{ dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); raft::mr::device::buffer row_counts(d_alloc, stream, m + 1); - CUDA_CHECK( - cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream)); - csr_add_calc_row_counts_kernel - <<>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, - b_val, nnz2, m, row_counts.data()); + csr_add_calc_row_counts_kernel<<>>( + a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, row_counts.data()); int cnnz = 0; raft::update_host(&cnnz, row_counts.data() + m, 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); // create csr compressed row index from row counts - thrust::device_ptr row_counts_d = - thrust::device_pointer_cast(row_counts.data()); - thrust::device_ptr c_ind_d = thrust::device_pointer_cast(out_ind); - exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, - c_ind_d); + thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); + thrust::device_ptr c_ind_d = thrust::device_pointer_cast(out_ind); + exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, c_ind_d); return cnnz; } @@ -208,16 +225,25 @@ size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, * @param stream: cuda stream to use */ template -void csr_add_finalize(const int *a_ind, const int *a_indptr, const T *a_val, - int nnz1, const int *b_ind, const int *b_indptr, - const T *b_val, int nnz2, int m, int *c_ind, - int *c_indptr, T *c_val, cudaStream_t stream) { +void csr_add_finalize(const int* a_ind, + const int* a_indptr, + const T* a_val, + int nnz1, + const int* b_ind, + const int* b_indptr, + const T* b_val, + int nnz2, + int m, + int* c_ind, + int* c_indptr, + T* c_val, + cudaStream_t stream) +{ dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_add_kernel - <<>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, - b_val, nnz2, m, c_ind, c_indptr, c_val); + csr_add_kernel<<>>( + a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, c_ind, c_indptr, c_val); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh index 9bd322c90a..77a9445ab1 100644 --- a/cpp/include/raft/sparse/linalg/degree.cuh +++ b/cpp/include/raft/sparse/linalg/degree.cuh @@ -44,11 +44,10 @@ namespace linalg { * @param results array to place results */ template -__global__ void coo_degree_kernel(const int *rows, int nnz, int *results) { +__global__ void coo_degree_kernel(const int* rows, int nnz, int* results) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < nnz) { - raft::myAtomicAdd(results + rows[row], 1); - } + if (row < nnz) { raft::myAtomicAdd(results + rows[row], 1); } } /** @@ -60,7 +59,8 @@ __global__ void coo_degree_kernel(const int *rows, int nnz, int *results) { * @param stream: cuda stream to use */ template -void coo_degree(const int *rows, int nnz, int *results, cudaStream_t stream) { +void coo_degree(const int* rows, int nnz, int* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); @@ -77,31 +77,28 @@ void coo_degree(const int *rows, int nnz, int *results, cudaStream_t stream) { * @param stream: cuda stream to use */ template -void coo_degree(COO *in, int *results, cudaStream_t stream) { +void coo_degree(COO* in, int* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); - coo_degree_kernel - <<>>(in->rows(), in->nnz, results); + coo_degree_kernel<<>>(in->rows(), in->nnz, results); CUDA_CHECK(cudaGetLastError()); } template -__global__ void coo_degree_nz_kernel(const int *rows, const T *vals, int nnz, - int *results) { +__global__ void coo_degree_nz_kernel(const int* rows, const T* vals, int nnz, int* results) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < nnz && vals[row] != 0.0) { - raft::myAtomicAdd(results + rows[row], 1); - } + if (row < nnz && vals[row] != 0.0) { raft::myAtomicAdd(results + rows[row], 1); } } template -__global__ void coo_degree_scalar_kernel(const int *rows, const T *vals, - int nnz, T scalar, int *results) { +__global__ void coo_degree_scalar_kernel( + const int* rows, const T* vals, int nnz, T scalar, int* results) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < nnz && vals[row] != scalar) { - raft::myAtomicAdd(results + rows[row], 1); - } + if (row < nnz && vals[row] != scalar) { raft::myAtomicAdd(results + rows[row], 1); } } /** @@ -114,12 +111,12 @@ __global__ void coo_degree_scalar_kernel(const int *rows, const T *vals, * @param stream: cuda stream to use */ template -void coo_degree_scalar(COO *in, T scalar, int *results, - cudaStream_t stream) { +void coo_degree_scalar(COO* in, T scalar, int* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); - coo_degree_scalar_kernel<<>>( - in->rows(), in->vals(), in->nnz, scalar, results); + coo_degree_scalar_kernel + <<>>(in->rows(), in->vals(), in->nnz, scalar, results); CUDA_CHECK(cudaGetLastError()); } @@ -135,8 +132,9 @@ void coo_degree_scalar(COO *in, T scalar, int *results, * @param stream: cuda stream to use */ template -void coo_degree_scalar(const int *rows, const T *vals, int nnz, T scalar, - int *results, cudaStream_t stream = 0) { +void coo_degree_scalar( + const int* rows, const T* vals, int nnz, T scalar, int* results, cudaStream_t stream = 0) +{ dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); coo_degree_scalar_kernel @@ -154,12 +152,11 @@ void coo_degree_scalar(const int *rows, const T *vals, int nnz, T scalar, * @param stream: cuda stream to use */ template -void coo_degree_nz(const int *rows, const T *vals, int nnz, int *results, - cudaStream_t stream) { +void coo_degree_nz(const int* rows, const T* vals, int nnz, int* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); - coo_degree_nz_kernel - <<>>(rows, vals, nnz, results); + coo_degree_nz_kernel<<>>(rows, vals, nnz, results); } /** @@ -171,7 +168,8 @@ void coo_degree_nz(const int *rows, const T *vals, int nnz, int *results, * @param stream: cuda stream to use */ template -void coo_degree_nz(COO *in, int *results, cudaStream_t stream) { +void coo_degree_nz(COO* in, int* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh index bfcd3fd592..59dc5ff3e4 100644 --- a/cpp/include/raft/sparse/linalg/norm.cuh +++ b/cpp/include/raft/sparse/linalg/norm.cuh @@ -41,10 +41,12 @@ __global__ void csr_row_normalize_l1_kernel( // @TODO: This can be done much more parallel by // having threads in a warp compute the sum in parallel // over each row and then divide the values in parallel. - const int *ia, // csr row ex_scan (sorted by row) - const T *vals, int nnz, // array of values and number of non-zeros - int m, // num rows in csr - T *result) { // output array + const int* ia, // csr row ex_scan (sorted by row) + const T* vals, + int nnz, // array of values and number of non-zeros + int m, // num rows in csr + T* result) +{ // output array // row-based matrix 1 thread per row int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -52,7 +54,7 @@ __global__ void csr_row_normalize_l1_kernel( // sum all vals_arr for row and divide each val by sum if (row < m) { int start_idx = ia[row]; - int stop_idx = 0; + int stop_idx = 0; if (row < m - 1) { stop_idx = ia[row + 1]; } else @@ -65,7 +67,7 @@ __global__ void csr_row_normalize_l1_kernel( for (int j = start_idx; j < stop_idx; j++) { if (sum != 0.0) { - T val = vals[j]; + T val = vals[j]; result[j] = val / sum; } else { result[j] = 0.0; @@ -85,18 +87,18 @@ __global__ void csr_row_normalize_l1_kernel( * @param stream: cuda stream to use */ template -void csr_row_normalize_l1(const int *ia, // csr row ex_scan (sorted by row) - const T *vals, +void csr_row_normalize_l1(const int* ia, // csr row ex_scan (sorted by row) + const T* vals, int nnz, // array of values and number of non-zeros int m, // num rows in csr - T *result, - cudaStream_t stream) { // output array + T* result, + cudaStream_t stream) +{ // output array dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_row_normalize_l1_kernel - <<>>(ia, vals, nnz, m, result); + csr_row_normalize_l1_kernel<<>>(ia, vals, nnz, m, result); CUDA_CHECK(cudaGetLastError()); } @@ -105,10 +107,12 @@ __global__ void csr_row_normalize_max_kernel( // @TODO: This can be done much more parallel by // having threads in a warp compute the sum in parallel // over each row and then divide the values in parallel. - const int *ia, // csr row ind array (sorted by row) - const T *vals, int nnz, // array of values and number of non-zeros - int m, // num total rows in csr - T *result) { // output array + const int* ia, // csr row ind array (sorted by row) + const T* vals, + int nnz, // array of values and number of non-zeros + int m, // num total rows in csr + T* result) +{ // output array // row-based matrix 1 thread per row int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -116,7 +120,7 @@ __global__ void csr_row_normalize_max_kernel( // find max across columns and divide if (row < m) { int start_idx = ia[row]; - int stop_idx = 0; + int stop_idx = 0; if (row < m - 1) { stop_idx = ia[row + 1]; } else @@ -130,7 +134,7 @@ __global__ void csr_row_normalize_max_kernel( // divide nonzeros in current row by max for (int j = start_idx; j < stop_idx; j++) { if (max != 0.0 && max > std::numeric_limits::min()) { - T val = vals[j]; + T val = vals[j]; result[j] = val / max; } else { result[j] = 0.0; @@ -151,16 +155,17 @@ __global__ void csr_row_normalize_max_kernel( */ template -void csr_row_normalize_max(const int *ia, // csr row ind array (sorted by row) - const T *vals, +void csr_row_normalize_max(const int* ia, // csr row ind array (sorted by row) + const T* vals, int nnz, // array of values and number of non-zeros int m, // num total rows in csr - T *result, cudaStream_t stream) { + T* result, + cudaStream_t stream) +{ dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_row_normalize_max_kernel - <<>>(ia, vals, nnz, m, result); + csr_row_normalize_max_kernel<<>>(ia, vals, nnz, m, result); CUDA_CHECK(cudaGetLastError()); } diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh index 15302f3b74..3b609d994f 100644 --- a/cpp/include/raft/sparse/linalg/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/spectral.cuh @@ -31,16 +31,23 @@ namespace sparse { namespace spectral { template -void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, - int nnz, int n, int n_components, T *out, - unsigned long long seed = 1234567) { - auto stream = handle.get_stream(); +void fit_embedding(const raft::handle_t& handle, + int* rows, + int* cols, + T* vals, + int nnz, + int n, + int n_components, + T* out, + unsigned long long seed = 1234567) +{ + auto stream = handle.get_stream(); auto d_alloc = handle.get_device_allocator(); raft::mr::device::buffer src_offsets(d_alloc, stream, n + 1); raft::mr::device::buffer dst_cols(d_alloc, stream, nnz); raft::mr::device::buffer dst_vals(d_alloc, stream, nnz); - convert::coo_to_csr(handle, rows, cols, vals, nnz, n, src_offsets.data(), - dst_cols.data(), dst_vals.data()); + convert::coo_to_csr( + handle, rows, cols, vals, nnz, n, src_offsets.data(), dst_cols.data(), dst_vals.data()); raft::mr::device::buffer eigVals(d_alloc, stream, n_components + 1); raft::mr::device::buffer eigVecs(d_alloc, stream, n * (n_components + 1)); @@ -54,48 +61,53 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, using index_type = int; using value_type = T; - index_type *ro = src_offsets.data(); - index_type *ci = dst_cols.data(); - value_type *vs = dst_vals.data(); + index_type* ro = src_offsets.data(); + index_type* ci = dst_cols.data(); + value_type* vs = dst_vals.data(); - raft::matrix::sparse_matrix_t const r_csr_m{ - handle, ro, ci, vs, n, nnz}; + raft::matrix::sparse_matrix_t const r_csr_m{handle, ro, ci, vs, n, nnz}; - index_type neigvs = n_components + 1; - index_type maxiter = 4000; //default reset value (when set to 0); - value_type tol = 0.01; - index_type restart_iter = 15 + neigvs; //what cugraph is using - auto t_exe_p = thrust::cuda::par.on(stream); + index_type neigvs = n_components + 1; + index_type maxiter = 4000; // default reset value (when set to 0); + value_type tol = 0.01; + index_type restart_iter = 15 + neigvs; // what cugraph is using + auto t_exe_p = thrust::cuda::par.on(stream); using thrust_exe_policy_t = decltype(t_exe_p); - raft::eigen_solver_config_t cfg{neigvs, maxiter, - restart_iter, tol}; + raft::eigen_solver_config_t cfg{neigvs, maxiter, restart_iter, tol}; cfg.seed = seed; raft::lanczos_solver_t eig_solver{cfg}; - //cluster computation here is irrelevant, - //hence define a no-op such solver to - //feed partition(): + // cluster computation here is irrelevant, + // hence define a no-op such solver to + // feed partition(): // struct no_op_cluster_solver_t { using index_type_t = index_type; - using size_type_t = index_type; + using size_type_t = index_type; using value_type_t = value_type; - std::pair solve( - handle_t const &handle, thrust_exe_policy_t t_exe_policy, - size_type_t n_obs_vecs, size_type_t dim, - value_type_t const *__restrict__ obs, - index_type_t *__restrict__ codes) const { + std::pair solve(handle_t const& handle, + thrust_exe_policy_t t_exe_policy, + size_type_t n_obs_vecs, + size_type_t dim, + value_type_t const* __restrict__ obs, + index_type_t* __restrict__ codes) const + { return std::make_pair(0, 0); } }; - raft::spectral::partition(handle, t_exe_p, r_csr_m, eig_solver, - no_op_cluster_solver_t{}, labels.data(), - eigVals.data(), eigVecs.data()); + raft::spectral::partition(handle, + t_exe_p, + r_csr_m, + eig_solver, + no_op_cluster_solver_t{}, + labels.data(), + eigVals.data(), + eigVecs.data()); raft::copy(out, eigVecs.data() + n, n * n_components, stream); diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh index 5c2c78f0c3..b9426c284a 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh @@ -49,26 +49,34 @@ namespace linalg { // TODO: value_idx param needs to be used for this once FAISS is updated to use float32 // for indices so that the index types can be uniform template -__global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, - T *vals, int *orows, int *ocols, T *ovals, - int n, int cnnz, Lambda reduction_op) { +__global__ void coo_symmetrize_kernel(int* row_ind, + int* rows, + int* cols, + T* vals, + int* orows, + int* ocols, + T* ovals, + int n, + int cnnz, + Lambda reduction_op) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < n) { int start_idx = row_ind[row]; // each thread processes one row - int stop_idx = get_stop_idx(row, n, cnnz, row_ind); + int stop_idx = get_stop_idx(row, n, cnnz, row_ind); - int row_nnz = 0; + int row_nnz = 0; int out_start_idx = start_idx * 2; for (int idx = 0; idx < stop_idx - start_idx; idx++) { int cur_row = rows[idx + start_idx]; int cur_col = cols[idx + start_idx]; - T cur_val = vals[idx + start_idx]; + T cur_val = vals[idx + start_idx]; int lookup_row = cur_col; - int t_start = row_ind[lookup_row]; // Start at - int t_stop = get_stop_idx(lookup_row, n, cnnz, row_ind); + int t_start = row_ind[lookup_row]; // Start at + int t_stop = get_stop_idx(lookup_row, n, cnnz, row_ind); T transpose = 0.0; @@ -79,7 +87,7 @@ __global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, // done in a different thread. if (cols[t_idx] == cur_row && rows[t_idx] == cur_col) { // If it exists already, set transposed value to existing value - transpose = vals[t_idx]; + transpose = vals[t_idx]; found_match = true; break; } @@ -126,10 +134,12 @@ __global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, * @param stream: cuda stream to use */ template -void coo_symmetrize(COO *in, COO *out, +void coo_symmetrize(COO* in, + COO* out, Lambda reduction_op, // two-argument reducer std::shared_ptr d_alloc, - cudaStream_t stream) { + cudaStream_t stream) +{ dim3 grid(raft::ceildiv(in->n_rows, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); @@ -141,9 +151,16 @@ void coo_symmetrize(COO *in, COO *out, out->allocate(in->nnz * 2, in->n_rows, in->n_cols, true, stream); - coo_symmetrize_kernel<<>>( - in_row_ind.data(), in->rows(), in->cols(), in->vals(), out->rows(), - out->cols(), out->vals(), in->n_rows, in->nnz, reduction_op); + coo_symmetrize_kernel<<>>(in_row_ind.data(), + in->rows(), + in->cols(), + in->vals(), + out->rows(), + out->cols(), + out->vals(), + in->n_rows, + in->nnz, + reduction_op); CUDA_CHECK(cudaPeekAtLastError()); } @@ -159,14 +176,15 @@ void coo_symmetrize(COO *in, COO *out, * @param row_sizes2: Input empty row sum 2 array(n) for faster reduction */ template -__global__ static void symmetric_find_size(const value_t *restrict data, - const value_idx *restrict indices, - const value_idx n, const int k, - value_idx *restrict row_sizes, - value_idx *restrict row_sizes2) { +__global__ static void symmetric_find_size(const value_t* restrict data, + const value_idx* restrict indices, + const value_idx n, + const int k, + value_idx* restrict row_sizes, + value_idx* restrict row_sizes2) +{ const auto row = blockIdx.x * blockDim.x + threadIdx.x; // for every row - const auto j = - blockIdx.y * blockDim.y + threadIdx.y; // for every item in row + const auto j = blockIdx.y * blockDim.y + threadIdx.y; // for every item in row if (row >= n || j >= k) return; const auto col = indices[row * k + j]; @@ -186,9 +204,11 @@ __global__ static void symmetric_find_size(const value_t *restrict data, * @param row_sizes2: Input row sum 2 array(n) for faster reduction */ template -__global__ static void reduce_find_size(const value_idx n, const int k, - value_idx *restrict row_sizes, - const value_idx *restrict row_sizes2) { +__global__ static void reduce_find_size(const value_idx n, + const int k, + value_idx* restrict row_sizes, + const value_idx* restrict row_sizes2) +{ const auto i = (blockIdx.x * blockDim.x) + threadIdx.x; if (i >= n) return; row_sizes[i] += (row_sizes2[i] + k); @@ -209,20 +229,21 @@ __global__ static void reduce_find_size(const value_idx n, const int k, * @param k: Number of n_neighbors */ template -__global__ static void symmetric_sum(value_idx *restrict edges, - const value_t *restrict data, - const value_idx *restrict indices, - value_t *restrict VAL, - value_idx *restrict COL, - value_idx *restrict ROW, const value_idx n, - const int k) { +__global__ static void symmetric_sum(value_idx* restrict edges, + const value_t* restrict data, + const value_idx* restrict indices, + value_t* restrict VAL, + value_idx* restrict COL, + value_idx* restrict ROW, + const value_idx n, + const int k) +{ const auto row = blockIdx.x * blockDim.x + threadIdx.x; // for every row - const auto j = - blockIdx.y * blockDim.y + threadIdx.y; // for every item in row + const auto j = blockIdx.y * blockDim.y + threadIdx.y; // for every item in row if (row >= n || j >= k) return; - const auto col = indices[row * k + j]; - const auto original = atomicAdd(&edges[row], value_idx(1)); + const auto col = indices[row * k + j]; + const auto original = atomicAdd(&edges[row], value_idx(1)); const auto transpose = atomicAdd(&edges[col], value_idx(1)); VAL[transpose] = VAL[original] = data[row * k + j]; @@ -252,26 +273,26 @@ __global__ static void symmetric_sum(value_idx *restrict edges, * @param stream: Input cuda stream * @param d_alloc device allocator for temporary buffers */ -template -void from_knn_symmetrize_matrix( - const value_idx *restrict knn_indices, const value_t *restrict knn_dists, - const value_idx n, const int k, COO *out, - cudaStream_t stream, std::shared_ptr d_alloc) { +template +void from_knn_symmetrize_matrix(const value_idx* restrict knn_indices, + const value_t* restrict knn_dists, + const value_idx n, + const int k, + COO* out, + cudaStream_t stream, + std::shared_ptr d_alloc) +{ // (1) Find how much space needed in each row // We look through all datapoints and increment the count for each row. const dim3 threadsPerBlock(TPB_X, TPB_Y); - const dim3 numBlocks(raft::ceildiv(n, (value_idx)TPB_X), - raft::ceildiv(k, TPB_Y)); + const dim3 numBlocks(raft::ceildiv(n, (value_idx)TPB_X), raft::ceildiv(k, TPB_Y)); // Notice n+1 since we can reuse these arrays for transpose_edges, original_edges in step (4) raft::mr::device::buffer row_sizes(d_alloc, stream, n); - CUDA_CHECK( - cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream)); + CUDA_CHECK(cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream)); raft::mr::device::buffer row_sizes2(d_alloc, stream, n); - CUDA_CHECK( - cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream)); + CUDA_CHECK(cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream)); symmetric_find_size<<>>( knn_dists, knn_indices, n, k, row_sizes.data(), row_sizes2.data()); @@ -292,14 +313,12 @@ void from_knn_symmetrize_matrix( // This mirrors CSR matrix's row Pointer, were maximum bounds for each row // are calculated as the cumulative rolling sum of the previous rows. // Notice reusing old row_sizes2 memory - value_idx *edges = row_sizes2.data(); - thrust::device_ptr __edges = thrust::device_pointer_cast(edges); - thrust::device_ptr __row_sizes = - thrust::device_pointer_cast(row_sizes.data()); + value_idx* edges = row_sizes2.data(); + thrust::device_ptr __edges = thrust::device_pointer_cast(edges); + thrust::device_ptr __row_sizes = thrust::device_pointer_cast(row_sizes.data()); // Rolling cumulative sum - thrust::exclusive_scan(thrust::cuda::par.on(stream), __row_sizes, - __row_sizes + n, __edges); + thrust::exclusive_scan(thrust::cuda::par.on(stream), __row_sizes, __row_sizes + n, __edges); // (5) Perform final data + data.T operation in tandem with memcpying symmetric_sum<<>>( @@ -311,11 +330,17 @@ void from_knn_symmetrize_matrix( * Symmetrizes a COO matrix */ template -void symmetrize(const raft::handle_t &handle, const value_idx *rows, - const value_idx *cols, const value_t *vals, size_t m, size_t n, - size_t nnz, raft::sparse::COO &out) { +void symmetrize(const raft::handle_t& handle, + const value_idx* rows, + const value_idx* cols, + const value_t* vals, + size_t m, + size_t n, + size_t nnz, + raft::sparse::COO& out) +{ auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // copy rows to cols and cols to rows rmm::device_uvector symm_rows(nnz * 2, stream); @@ -331,13 +356,17 @@ void symmetrize(const raft::handle_t &handle, const value_idx *rows, raft::copy_async(symm_vals.data() + nnz, vals, nnz, stream); // sort COO - raft::sparse::op::coo_sort((value_idx)m, (value_idx)n, (value_idx)nnz * 2, - symm_rows.data(), symm_cols.data(), - symm_vals.data(), d_alloc, stream); - - raft::sparse::op::max_duplicates(handle, out, symm_rows.data(), - symm_cols.data(), symm_vals.data(), nnz * 2, - m, n); + raft::sparse::op::coo_sort((value_idx)m, + (value_idx)n, + (value_idx)nnz * 2, + symm_rows.data(), + symm_cols.data(), + symm_vals.data(), + d_alloc, + stream); + + raft::sparse::op::max_duplicates( + handle, out, symm_rows.data(), symm_cols.data(), symm_vals.data(), nnz * 2, m, n); } }; // end NAMESPACE linalg diff --git a/cpp/include/raft/sparse/linalg/transpose.h b/cpp/include/raft/sparse/linalg/transpose.h index 6afe4ca8f6..ce90eb6702 100644 --- a/cpp/include/raft/sparse/linalg/transpose.h +++ b/cpp/include/raft/sparse/linalg/transpose.h @@ -57,29 +57,55 @@ namespace linalg { * @param[in] stream : Cuda stream for ordering events */ template -void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr, - const value_idx *csr_indices, const value_t *csr_data, - value_idx *csc_indptr, value_idx *csc_indices, - value_t *csc_data, value_idx csr_nrows, value_idx csr_ncols, +void csr_transpose(cusparseHandle_t handle, + const value_idx* csr_indptr, + const value_idx* csr_indices, + const value_t* csr_data, + value_idx* csc_indptr, + value_idx* csc_indices, + value_t* csc_data, + value_idx csr_nrows, + value_idx csr_ncols, value_idx nnz, std::shared_ptr allocator, - cudaStream_t stream) { + cudaStream_t stream) +{ size_t convert_csc_workspace_size = 0; - CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize( - handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices, - csc_data, csc_indptr, csc_indices, CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, - &convert_csc_workspace_size, stream)); + CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize(handle, + csr_nrows, + csr_ncols, + nnz, + csr_data, + csr_indptr, + csr_indices, + csc_data, + csc_indptr, + csc_indices, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + &convert_csc_workspace_size, + stream)); raft::mr::device::buffer convert_csc_workspace( allocator, stream, convert_csc_workspace_size); - CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc( - handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices, - csc_data, csc_indptr, csc_indices, CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, - convert_csc_workspace.data(), stream)); + CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc(handle, + csr_nrows, + csr_ncols, + nnz, + csr_data, + csr_indptr, + csr_indices, + csc_data, + csc_indptr, + csc_indices, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + convert_csc_workspace.data(), + stream)); } }; // end NAMESPACE linalg diff --git a/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh b/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh index f0d30b0cb7..36d426029b 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh +++ b/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh @@ -28,10 +28,16 @@ namespace mst { namespace detail { template -__global__ void kernel_min_edge_per_vertex( - const edge_t* offsets, const vertex_t* indices, const alteration_t* weights, - const vertex_t* color, const vertex_t* color_index, edge_t* new_mst_edge, - const bool* mst_edge, alteration_t* min_edge_color, const vertex_t v) { +__global__ void kernel_min_edge_per_vertex(const edge_t* offsets, + const vertex_t* indices, + const alteration_t* weights, + const vertex_t* color, + const vertex_t* color_index, + edge_t* new_mst_edge, + const bool* mst_edge, + alteration_t* min_edge_color, + const vertex_t v) +{ edge_t tid = threadIdx.x + blockIdx.x * blockDim.x; unsigned warp_id = tid / 32; @@ -41,14 +47,14 @@ __global__ void kernel_min_edge_per_vertex( __shared__ alteration_t min_edge_weight[32]; __shared__ vertex_t min_color[32]; - min_edge_index[lane_id] = std::numeric_limits::max(); + min_edge_index[lane_id] = std::numeric_limits::max(); min_edge_weight[lane_id] = std::numeric_limits::max(); - min_color[lane_id] = std::numeric_limits::max(); + min_color[lane_id] = std::numeric_limits::max(); __syncthreads(); vertex_t self_color_idx = color_index[warp_id]; - vertex_t self_color = color[self_color_idx]; + vertex_t self_color = color[self_color_idx]; // find the minimum edge associated per row // each thread in warp holds the minimum edge for @@ -56,20 +62,20 @@ __global__ void kernel_min_edge_per_vertex( if (warp_id < v) { // one row is associated with one warp edge_t row_start = offsets[warp_id]; - edge_t row_end = offsets[warp_id + 1]; + edge_t row_end = offsets[warp_id + 1]; // assuming one warp per row // find min for each thread in warp for (edge_t e = row_start + lane_id; e < row_end; e += 32) { alteration_t curr_edge_weight = weights[e]; - vertex_t successor_color_idx = color_index[indices[e]]; - vertex_t successor_color = color[successor_color_idx]; + vertex_t successor_color_idx = color_index[indices[e]]; + vertex_t successor_color = color[successor_color_idx]; if (!mst_edge[e] && self_color != successor_color) { if (curr_edge_weight < min_edge_weight[lane_id]) { - min_color[lane_id] = successor_color; + min_color[lane_id] = successor_color; min_edge_weight[lane_id] = curr_edge_weight; - min_edge_index[lane_id] = e; + min_edge_index[lane_id] = e; } } } @@ -82,9 +88,9 @@ __global__ void kernel_min_edge_per_vertex( for (int offset = 16; offset > 0; offset >>= 1) { if (lane_id < offset) { if (min_edge_weight[lane_id] > min_edge_weight[lane_id + offset]) { - min_color[lane_id] = min_color[lane_id + offset]; + min_color[lane_id] = min_color[lane_id + offset]; min_edge_weight[lane_id] = min_edge_weight[lane_id + offset]; - min_edge_index[lane_id] = min_edge_index[lane_id + offset]; + min_edge_index[lane_id] = min_edge_index[lane_id + offset]; } } __syncthreads(); @@ -102,19 +108,26 @@ __global__ void kernel_min_edge_per_vertex( } } -template -__global__ void min_edge_per_supervertex( - const vertex_t* color, const vertex_t* color_index, edge_t* new_mst_edge, - bool* mst_edge, const vertex_t* indices, const weight_t* weights, - const alteration_t* altered_weights, vertex_t* temp_src, vertex_t* temp_dst, - weight_t* temp_weights, const alteration_t* min_edge_color, const vertex_t v, - bool symmetrize_output) { +template +__global__ void min_edge_per_supervertex(const vertex_t* color, + const vertex_t* color_index, + edge_t* new_mst_edge, + bool* mst_edge, + const vertex_t* indices, + const weight_t* weights, + const alteration_t* altered_weights, + vertex_t* temp_src, + vertex_t* temp_dst, + weight_t* temp_weights, + const alteration_t* min_edge_color, + const vertex_t v, + bool symmetrize_output) +{ auto tid = get_1D_idx(); if (tid < v) { vertex_t vertex_color_idx = color_index[tid]; - vertex_t vertex_color = color[vertex_color_idx]; - edge_t edge_idx = new_mst_edge[tid]; + vertex_t vertex_color = color[vertex_color_idx]; + edge_t edge_idx = new_mst_edge[tid]; // check if valid outgoing edge was found // find minimum edge is same as minimum edge of whole supervertex @@ -129,32 +142,27 @@ __global__ void min_edge_per_supervertex( auto dst = indices[edge_idx]; if (!symmetrize_output) { auto dst_edge_idx = new_mst_edge[dst]; - auto dst_color = color[color_index[dst]]; + auto dst_color = color[color_index[dst]]; // vertices added each other // only if destination has found an edge // the edge points back to source // the edge is minimum edge found for dst color - if (dst_edge_idx != std::numeric_limits::max() && - indices[dst_edge_idx] == tid && + if (dst_edge_idx != std::numeric_limits::max() && indices[dst_edge_idx] == tid && min_edge_color[dst_color] == altered_weights[dst_edge_idx]) { - if (vertex_color > dst_color) { - add_edge = false; - } + if (vertex_color > dst_color) { add_edge = false; } } } if (add_edge) { - temp_src[tid] = tid; - temp_dst[tid] = dst; - temp_weights[tid] = weights[edge_idx]; + temp_src[tid] = tid; + temp_dst[tid] = dst; + temp_weights[tid] = weights[edge_idx]; mst_edge[edge_idx] = true; } } - if (!add_edge) { - new_mst_edge[tid] = std::numeric_limits::max(); - } + if (!add_edge) { new_mst_edge[tid] = std::numeric_limits::max(); } } } } @@ -162,9 +170,13 @@ __global__ void min_edge_per_supervertex( template __global__ void add_reverse_edge(const edge_t* new_mst_edge, const vertex_t* indices, - const weight_t* weights, vertex_t* temp_src, - vertex_t* temp_dst, weight_t* temp_weights, - const vertex_t v, bool symmetrize_output) { + const weight_t* weights, + vertex_t* temp_src, + vertex_t* temp_dst, + weight_t* temp_weights, + const vertex_t v, + bool symmetrize_output) +{ auto tid = get_1D_idx(); if (tid < v) { @@ -186,9 +198,7 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge, // if vertices did not pick each other // add a reverse edge - if (tid != neighbor_vertex_neighbor) { - reverse_needed = true; - } + if (tid != neighbor_vertex_neighbor) { reverse_needed = true; } } } @@ -197,8 +207,8 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge, // it is assumed the each vertex only picks one valid min edge // per cycle // hence, we store at index tid + v for the reverse edge scenario - temp_src[tid + v] = neighbor_vertex; - temp_dst[tid + v] = tid; + temp_src[tid + v] = neighbor_vertex; + temp_dst[tid + v] = tid; temp_weights[tid + v] = weights[edge_idx]; } } @@ -207,11 +217,13 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge, // executes for newly added mst edges and updates the colors of both vertices to the lower color template -__global__ void min_pair_colors(const vertex_t v, const vertex_t* indices, +__global__ void min_pair_colors(const vertex_t v, + const vertex_t* indices, const edge_t* new_mst_edge, const vertex_t* color, const vertex_t* color_index, - vertex_t* next_color) { + vertex_t* next_color) +{ auto i = get_1D_idx(); if (i < v) { @@ -220,9 +232,9 @@ __global__ void min_pair_colors(const vertex_t v, const vertex_t* indices, if (edge_idx != std::numeric_limits::max()) { vertex_t neighbor_vertex = indices[edge_idx]; // vertex_t self_color = color[i]; - vertex_t self_color_idx = color_index[i]; - vertex_t self_color = color[self_color_idx]; - vertex_t neighbor_color_idx = color_index[neighbor_vertex]; + vertex_t self_color_idx = color_index[i]; + vertex_t self_color = color[self_color_idx]; + vertex_t neighbor_color_idx = color_index[neighbor_vertex]; vertex_t neighbor_super_color = color[neighbor_color_idx]; // update my own color as source of edge @@ -238,33 +250,36 @@ __global__ void min_pair_colors(const vertex_t v, const vertex_t* indices, // for each vertex, update color if it was changed in min_pair_colors kernel template -__global__ void update_colors(const vertex_t v, vertex_t* color, +__global__ void update_colors(const vertex_t v, + vertex_t* color, const vertex_t* color_index, - const vertex_t* next_color, bool* done) { + const vertex_t* next_color, + bool* done) +{ auto i = get_1D_idx(); if (i < v) { - vertex_t self_color = color[i]; + vertex_t self_color = color[i]; vertex_t self_color_idx = color_index[i]; - vertex_t new_color = next_color[self_color_idx]; + vertex_t new_color = next_color[self_color_idx]; // update self color to new smaller color if (self_color > new_color) { color[i] = new_color; - *done = false; + *done = false; } } } // point vertices to their final color index template -__global__ void final_color_indices(const vertex_t v, const vertex_t* color, - vertex_t* color_index) { +__global__ void final_color_indices(const vertex_t v, const vertex_t* color, vertex_t* color_index) +{ auto i = get_1D_idx(); if (i < v) { vertex_t self_color_idx = color_index[i]; - vertex_t self_color = color[self_color_idx]; + vertex_t self_color = color[self_color_idx]; // if self color is not equal to self color index, // it means self is not supervertex @@ -272,7 +287,7 @@ __global__ void final_color_indices(const vertex_t v, const vertex_t* color, // parent supervertex while (self_color_idx != self_color) { self_color_idx = color_index[self_color]; - self_color = color[self_color_idx]; + self_color = color[self_color_idx]; } // point to new supervertex @@ -282,22 +297,23 @@ __global__ void final_color_indices(const vertex_t v, const vertex_t* color, // Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu // Consider using curand device API instead of precomputed random_values array -template -__global__ void alteration_kernel(const vertex_t v, const edge_t e, +template +__global__ void alteration_kernel(const vertex_t v, + const edge_t e, const edge_t* offsets, const vertex_t* indices, - const weight_t* weights, alteration_t max, + const weight_t* weights, + alteration_t max, alteration_t* random_values, - alteration_t* altered_weights) { + alteration_t* altered_weights) +{ auto row = get_1D_idx(); if (row < v) { auto row_begin = offsets[row]; - auto row_end = offsets[row + 1]; + auto row_end = offsets[row + 1]; for (auto i = row_begin; i < row_end; i++) { - auto column = indices[i]; - altered_weights[i] = - weights[i] + max * (random_values[row] + random_values[column]); + auto column = indices[i]; + altered_weights[i] = weights[i] + max * (random_values[row] + random_values[column]); } } } @@ -305,17 +321,15 @@ __global__ void alteration_kernel(const vertex_t v, const edge_t e, template __global__ void kernel_count_new_mst_edges(const vertex_t* mst_src, edge_t* mst_edge_count, - const vertex_t v) { + const vertex_t v) +{ auto tid = get_1D_idx(); // count number of new mst edges added - bool predicate = - tid < v && (mst_src[tid] != std::numeric_limits::max()); + bool predicate = tid < v && (mst_src[tid] != std::numeric_limits::max()); vertex_t block_count = __syncthreads_count(predicate); - if (threadIdx.x == 0 && block_count > 0) { - atomicAdd(mst_edge_count, block_count); - } + if (threadIdx.x == 0 && block_count > 0) { atomicAdd(mst_edge_count, block_count); } } } // namespace detail diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh index c5ba4fcb4f..158f4cc314 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh +++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh @@ -46,21 +46,30 @@ typedef std::chrono::high_resolution_clock Clock; // curand generator uniform inline curandStatus_t curand_generate_uniformX(curandGenerator_t generator, - float* outputPtr, size_t n) { + float* outputPtr, + size_t n) +{ return curandGenerateUniform(generator, outputPtr, n); } inline curandStatus_t curand_generate_uniformX(curandGenerator_t generator, - double* outputPtr, size_t n) { + double* outputPtr, + size_t n) +{ return curandGenerateUniformDouble(generator, outputPtr, n); } -template -MST_solver::MST_solver( - const raft::handle_t& handle_, const edge_t* offsets_, - const vertex_t* indices_, const weight_t* weights_, const vertex_t v_, - const edge_t e_, vertex_t* color_, cudaStream_t stream_, - bool symmetrize_output_, bool initialize_colors_, int iterations_) +template +MST_solver::MST_solver(const raft::handle_t& handle_, + const edge_t* offsets_, + const vertex_t* indices_, + const weight_t* weights_, + const vertex_t v_, + const edge_t e_, + vertex_t* color_, + cudaStream_t stream_, + bool symmetrize_output_, + bool initialize_colors_, + int iterations_) : handle(handle_), offsets(offsets_), indices(indices_), @@ -82,12 +91,13 @@ MST_solver::MST_solver( stream(stream_), symmetrize_output(symmetrize_output_), initialize_colors(initialize_colors_), - iterations(iterations_) { - max_blocks = handle_.get_device_properties().maxGridSize[0]; + iterations(iterations_) +{ + max_blocks = handle_.get_device_properties().maxGridSize[0]; max_threads = handle_.get_device_properties().maxThreadsPerBlock; - sm_count = handle_.get_device_properties().multiProcessorCount; + sm_count = handle_.get_device_properties().multiProcessorCount; - //Initially, color holds the vertex id as color + // Initially, color holds the vertex id as color auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); if (initialize_colors_) { thrust::sequence(policy, color.begin(), color.end(), 0); @@ -98,10 +108,10 @@ MST_solver::MST_solver( thrust::sequence(policy, next_color.begin(), next_color.end(), 0); } -template +template raft::Graph_COO -MST_solver::solve() { +MST_solver::solve() +{ RAFT_EXPECTS(v > 0, "0 vertices"); RAFT_EXPECTS(e > 0, "0 edges"); RAFT_EXPECTS(offsets != nullptr, "Null offsets."); @@ -114,12 +124,13 @@ MST_solver::solve() { // Alterating the weights // this is done by identifying the lowest cost edge weight gap that is not 0, call this theta. - // For each edge, add noise that is less than theta. That is, generate a random number in the range [0.0, theta) and add it to each edge weight. + // For each edge, add noise that is less than theta. That is, generate a random number in the + // range [0.0, theta) and add it to each edge weight. alteration(); #ifdef MST_TIME auto stop = Clock::now(); - timer0 = duration_us(stop - start); + timer0 = duration_us(stop - start); #endif auto max_mst_edges = symmetrize_output ? 2 * v - 2 : v - 1; @@ -168,8 +179,8 @@ MST_solver::solve() { if (curr_mst_edge_count == prev_mst_edge_count[0]) { #ifdef MST_TIME std::cout << "Iterations: " << i << std::endl; - std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3 - << "," << timer4 << "," << timer5 << std::endl; + std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3 << "," << timer4 << "," + << timer5 << std::endl; #endif // exit here when reaching steady state break; @@ -179,8 +190,7 @@ MST_solver::solve() { start = Clock::now(); #endif // append the newly found MST edges to the final output - append_src_dst_pair(mst_result.src.data(), mst_result.dst.data(), - mst_result.weights.data()); + append_src_dst_pair(mst_result.src.data(), mst_result.dst.data(), mst_result.weights.data()); #ifdef MST_TIME stop = Clock::now(); timer4 += duration_us(stop - start); @@ -201,7 +211,7 @@ MST_solver::solve() { // result packaging thrust::host_vector host_mst_edge_count = mst_edge_count; - mst_result.n_edges = host_mst_edge_count[0]; + mst_result.n_edges = host_mst_edge_count[0]; mst_result.src.resize(mst_result.n_edges, stream); mst_result.dst.resize(mst_result.n_edges, stream); mst_result.weights.resize(mst_result.n_edges, stream); @@ -212,50 +222,46 @@ MST_solver::solve() { // ||y|-|x|| template struct alteration_functor { - __host__ __device__ weight_t - operator()(const thrust::tuple& t) { + __host__ __device__ weight_t operator()(const thrust::tuple& t) + { auto x = thrust::get<0>(t); auto y = thrust::get<1>(t); - x = x < 0 ? -x : x; - y = y < 0 ? -y : y; + x = x < 0 ? -x : x; + y = y < 0 ? -y : y; return x < y ? y - x : x - y; } }; // Compute the uper bound for the alteration -template -alteration_t -MST_solver::alteration_max() { +template +alteration_t MST_solver::alteration_max() +{ auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); rmm::device_vector tmp(e); thrust::device_ptr weights_ptr(weights); thrust::copy(policy, weights_ptr, weights_ptr + e, tmp.begin()); - //sort tmp weights + // sort tmp weights thrust::sort(policy, tmp.begin(), tmp.end()); - //remove duplicates + // remove duplicates auto new_end = thrust::unique(policy, tmp.begin(), tmp.end()); - //min(a[i+1]-a[i])/2 - auto begin = - thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1)); - auto end = - thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end)); - auto init = tmp[1] - tmp[0]; - auto max = - thrust::transform_reduce(policy, begin, end, alteration_functor(), - init, thrust::minimum()); + // min(a[i+1]-a[i])/2 + auto begin = thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1)); + auto end = thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end)); + auto init = tmp[1] - tmp[0]; + auto max = thrust::transform_reduce( + policy, begin, end, alteration_functor(), init, thrust::minimum()); return max / static_cast(2); } // Compute the alteration to make all undirected edge weight unique // Preserves weights order -template -void MST_solver::alteration() { +template +void MST_solver::alteration() +{ auto nthreads = std::min(v, max_threads); - auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); + auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); // maximum alteration that does not change realtive weights order alteration_t max = alteration_max(); @@ -269,35 +275,32 @@ void MST_solver::alteration() { curandSetPseudoRandomGeneratorSeed(randGen, 1234567); // Initialize rand values - auto curand_status = - curand_generate_uniformX(randGen, rand_values.data().get(), v); + auto curand_status = curand_generate_uniformX(randGen, rand_values.data().get(), v); RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND failed"); curand_status = curandDestroyGenerator(randGen); - RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, - "MST: CURAND cleanup failed"); + RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND cleanup failed"); - //Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu + // Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu detail::alteration_kernel<<>>( - v, e, offsets, indices, weights, max, rand_values.data().get(), - altered_weights.data().get()); + v, e, offsets, indices, weights, max, rand_values.data().get(), altered_weights.data().get()); } // updates colors of vertices by propagating the lower color to the higher -template -void MST_solver::label_prop( - vertex_t* mst_src, vertex_t* mst_dst) { +template +void MST_solver::label_prop(vertex_t* mst_src, + vertex_t* mst_dst) +{ // update the colors of both ends its until there is no change in colors thrust::host_vector curr_mst_edge_count = mst_edge_count; auto min_pair_nthreads = std::min(v, (vertex_t)max_threads); - auto min_pair_nblocks = std::min( - (v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks); + auto min_pair_nblocks = + std::min((v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks); rmm::device_vector done(1, false); edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - vertex_t* color_ptr = color.data().get(); + vertex_t* color_ptr = color.data().get(); vertex_t* next_color_ptr = next_color.data().get(); bool* done_ptr = done.data().get(); @@ -314,84 +317,99 @@ void MST_solver::label_prop( i++; } - detail:: - final_color_indices<<>>( - v, color_ptr, color_index); + detail::final_color_indices<<>>( + v, color_ptr, color_index); #ifdef MST_TIME std::cout << "Label prop iterations: " << i << std::endl; #endif } // Finds the minimum edge from each vertex to the lowest color -template -void MST_solver::min_edge_per_vertex() { +template +void MST_solver::min_edge_per_vertex() +{ auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); - thrust::fill(policy, min_edge_color.begin(), min_edge_color.end(), - std::numeric_limits::max()); - thrust::fill(policy, new_mst_edge.begin(), new_mst_edge.end(), - std::numeric_limits::max()); + thrust::fill( + policy, min_edge_color.begin(), min_edge_color.end(), std::numeric_limits::max()); + thrust::fill( + policy, new_mst_edge.begin(), new_mst_edge.end(), std::numeric_limits::max()); int n_threads = 32; - vertex_t* color_ptr = color.data().get(); - edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - bool* mst_edge_ptr = mst_edge.data().get(); - alteration_t* min_edge_color_ptr = min_edge_color.data().get(); + vertex_t* color_ptr = color.data().get(); + edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); + bool* mst_edge_ptr = mst_edge.data().get(); + alteration_t* min_edge_color_ptr = min_edge_color.data().get(); alteration_t* altered_weights_ptr = altered_weights.data().get(); - detail::kernel_min_edge_per_vertex<<>>( - offsets, indices, altered_weights_ptr, color_ptr, color_index, - new_mst_edge_ptr, mst_edge_ptr, min_edge_color_ptr, v); + detail::kernel_min_edge_per_vertex<<>>(offsets, + indices, + altered_weights_ptr, + color_ptr, + color_index, + new_mst_edge_ptr, + mst_edge_ptr, + min_edge_color_ptr, + v); } // Finds the minimum edge from each supervertex to the lowest color -template -void MST_solver::min_edge_per_supervertex() { +template +void MST_solver::min_edge_per_supervertex() +{ auto nthreads = std::min(v, max_threads); - auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); + auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); - thrust::fill(policy, temp_src.begin(), temp_src.end(), - std::numeric_limits::max()); + thrust::fill(policy, temp_src.begin(), temp_src.end(), std::numeric_limits::max()); - vertex_t* color_ptr = color.data().get(); - edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - bool* mst_edge_ptr = mst_edge.data().get(); - alteration_t* min_edge_color_ptr = min_edge_color.data().get(); + vertex_t* color_ptr = color.data().get(); + edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); + bool* mst_edge_ptr = mst_edge.data().get(); + alteration_t* min_edge_color_ptr = min_edge_color.data().get(); alteration_t* altered_weights_ptr = altered_weights.data().get(); - vertex_t* temp_src_ptr = temp_src.data().get(); - vertex_t* temp_dst_ptr = temp_dst.data().get(); - weight_t* temp_weights_ptr = temp_weights.data().get(); - - detail::min_edge_per_supervertex<<>>( - color_ptr, color_index, new_mst_edge_ptr, mst_edge_ptr, indices, weights, - altered_weights_ptr, temp_src_ptr, temp_dst_ptr, temp_weights_ptr, - min_edge_color_ptr, v, symmetrize_output); + vertex_t* temp_src_ptr = temp_src.data().get(); + vertex_t* temp_dst_ptr = temp_dst.data().get(); + weight_t* temp_weights_ptr = temp_weights.data().get(); + + detail::min_edge_per_supervertex<<>>(color_ptr, + color_index, + new_mst_edge_ptr, + mst_edge_ptr, + indices, + weights, + altered_weights_ptr, + temp_src_ptr, + temp_dst_ptr, + temp_weights_ptr, + min_edge_color_ptr, + v, + symmetrize_output); // the above kernel only adds directed mst edges in the case where // a pair of vertices don't pick the same min edge between them // so, now we add the reverse edge to make it undirected if (symmetrize_output) { - detail::add_reverse_edge<<>>( - new_mst_edge_ptr, indices, weights, temp_src_ptr, temp_dst_ptr, - temp_weights_ptr, v, symmetrize_output); + detail::add_reverse_edge<<>>(new_mst_edge_ptr, + indices, + weights, + temp_src_ptr, + temp_dst_ptr, + temp_weights_ptr, + v, + symmetrize_output); } } -template -void MST_solver::check_termination() { +template +void MST_solver::check_termination() +{ vertex_t nthreads = std::min(2 * v, (vertex_t)max_threads); - vertex_t nblocks = - std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks); + vertex_t nblocks = std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks); // count number of new mst edges edge_t* mst_edge_count_ptr = mst_edge_count.data().get(); - vertex_t* temp_src_ptr = temp_src.data().get(); + vertex_t* temp_src_ptr = temp_src.data().get(); detail::kernel_count_new_mst_edges<<>>( temp_src_ptr, mst_edge_count_ptr, 2 * v); @@ -399,36 +417,40 @@ void MST_solver::check_termination() { template struct new_edges_functor { - __host__ __device__ bool operator()( - const thrust::tuple& t) { + __host__ __device__ bool operator()(const thrust::tuple& t) + { auto src = thrust::get<0>(t); return src != std::numeric_limits::max() ? true : false; } }; -template +template void MST_solver::append_src_dst_pair( - vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) { + vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) +{ auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); auto curr_mst_edge_count = prev_mst_edge_count[0]; // iterator to end of mst edges added to final output in previous iteration - auto src_dst_zip_end = thrust::make_zip_iterator(thrust::make_tuple( - mst_src + curr_mst_edge_count, mst_dst + curr_mst_edge_count, - mst_weights + curr_mst_edge_count)); + auto src_dst_zip_end = + thrust::make_zip_iterator(thrust::make_tuple(mst_src + curr_mst_edge_count, + mst_dst + curr_mst_edge_count, + mst_weights + curr_mst_edge_count)); // iterator to new mst edges found - auto temp_src_dst_zip_begin = thrust::make_zip_iterator(thrust::make_tuple( - temp_src.begin(), temp_dst.begin(), temp_weights.begin())); + auto temp_src_dst_zip_begin = thrust::make_zip_iterator( + thrust::make_tuple(temp_src.begin(), temp_dst.begin(), temp_weights.begin())); auto temp_src_dst_zip_end = thrust::make_zip_iterator( thrust::make_tuple(temp_src.end(), temp_dst.end(), temp_weights.end())); // copy new mst edges to final output - thrust::copy_if(policy, temp_src_dst_zip_begin, temp_src_dst_zip_end, - src_dst_zip_end, new_edges_functor()); + thrust::copy_if(policy, + temp_src_dst_zip_begin, + temp_src_dst_zip_end, + src_dst_zip_end, + new_edges_functor()); } } // namespace mst diff --git a/cpp/include/raft/sparse/mst/detail/utils.cuh b/cpp/include/raft/sparse/mst/detail/utils.cuh index 8f755de459..24127c993f 100644 --- a/cpp/include/raft/sparse/mst/detail/utils.cuh +++ b/cpp/include/raft/sparse/mst/detail/utils.cuh @@ -26,32 +26,29 @@ namespace mst { namespace detail { template -__device__ idx_t get_1D_idx() { +__device__ idx_t get_1D_idx() +{ return blockIdx.x * blockDim.x + threadIdx.x; } // somewhat smart vector print template -void printv(rmm::device_vector& vec, const std::string& name = "", - const size_t displ = 5) { +void printv(rmm::device_vector& vec, const std::string& name = "", const size_t displ = 5) +{ #ifdef MST_TIME std::cout.precision(15); std::cout << name << " size = " << vec.size() << std::endl; if (displ < vec.size()) { - thrust::copy(vec.begin(), vec.begin() + displ, - std::ostream_iterator(std::cout, " ")); + thrust::copy(vec.begin(), vec.begin() + displ, std::ostream_iterator(std::cout, " ")); std::cout << " ... "; - thrust::copy(vec.end() - displ, vec.end(), - std::ostream_iterator(std::cout, " ")); + thrust::copy(vec.end() - displ, vec.end(), std::ostream_iterator(std::cout, " ")); } else { - thrust::copy(vec.begin(), vec.end(), - std::ostream_iterator(std::cout, " ")); + thrust::copy(vec.begin(), vec.end(), std::ostream_iterator(std::cout, " ")); } std::cout << std::endl << std::endl; #endif } -#define duration_us(a) \ - std::chrono::duration_cast(a).count() +#define duration_us(a) std::chrono::duration_cast(a).count() } // namespace detail } // namespace mst diff --git a/cpp/include/raft/sparse/mst/mst.cuh b/cpp/include/raft/sparse/mst/mst.cuh index 10c981445e..b49003467b 100644 --- a/cpp/include/raft/sparse/mst/mst.cuh +++ b/cpp/include/raft/sparse/mst/mst.cuh @@ -22,16 +22,30 @@ namespace raft { namespace mst { -template -raft::Graph_COO mst( - const raft::handle_t& handle, edge_t const* offsets, vertex_t const* indices, - weight_t const* weights, vertex_t const v, edge_t const e, vertex_t* color, - cudaStream_t stream, bool symmetrize_output = true, - bool initialize_colors = true, int iterations = 0) { - MST_solver mst_solver( - handle, offsets, indices, weights, v, e, color, stream, symmetrize_output, - initialize_colors, iterations); +template +raft::Graph_COO mst(const raft::handle_t& handle, + edge_t const* offsets, + vertex_t const* indices, + weight_t const* weights, + vertex_t const v, + edge_t const e, + vertex_t* color, + cudaStream_t stream, + bool symmetrize_output = true, + bool initialize_colors = true, + int iterations = 0) +{ + MST_solver mst_solver(handle, + offsets, + indices, + weights, + v, + e, + color, + stream, + symmetrize_output, + initialize_colors, + iterations); return mst_solver.solve(); } diff --git a/cpp/include/raft/sparse/mst/mst_solver.cuh b/cpp/include/raft/sparse/mst/mst_solver.cuh index 833882ea0d..e32bcfacac 100644 --- a/cpp/include/raft/sparse/mst/mst_solver.cuh +++ b/cpp/include/raft/sparse/mst/mst_solver.cuh @@ -31,20 +31,27 @@ struct Graph_COO { edge_t n_edges; Graph_COO(vertex_t size, cudaStream_t stream) - : src(size, stream), dst(size, stream), weights(size, stream) {} + : src(size, stream), dst(size, stream), weights(size, stream) + { + } }; namespace mst { -template +template class MST_solver { public: - MST_solver(const raft::handle_t& handle_, const edge_t* offsets_, - const vertex_t* indices_, const weight_t* weights_, - const vertex_t v_, const edge_t e_, vertex_t* color_, - cudaStream_t stream_, bool symmetrize_output_, - bool initialize_colors_, int iterations_); + MST_solver(const raft::handle_t& handle_, + const edge_t* offsets_, + const vertex_t* indices_, + const weight_t* weights_, + const vertex_t v_, + const edge_t e_, + vertex_t* color_, + cudaStream_t stream_, + bool symmetrize_output_, + bool initialize_colors_, + int iterations_); raft::Graph_COO solve(); @@ -56,7 +63,7 @@ class MST_solver { bool symmetrize_output, initialize_colors; int iterations; - //CSR + // CSR const edge_t* offsets; const vertex_t* indices; const weight_t* weights; @@ -67,20 +74,16 @@ class MST_solver { vertex_t max_threads; vertex_t sm_count; - vertex_t* color_index; // represent each supervertex as a color - rmm::device_vector - min_edge_color; // minimum incident edge weight per color - rmm::device_vector new_mst_edge; // new minimum edge per vertex - rmm::device_vector - altered_weights; // weights to be used for mst + vertex_t* color_index; // represent each supervertex as a color + rmm::device_vector min_edge_color; // minimum incident edge weight per color + rmm::device_vector new_mst_edge; // new minimum edge per vertex + rmm::device_vector altered_weights; // weights to be used for mst + rmm::device_vector mst_edge_count; // total number of edges added after every iteration rmm::device_vector - mst_edge_count; // total number of edges added after every iteration - rmm::device_vector - prev_mst_edge_count; // total number of edges up to the previous iteration - rmm::device_vector - mst_edge; // mst output - true if the edge belongs in mst + prev_mst_edge_count; // total number of edges up to the previous iteration + rmm::device_vector mst_edge; // mst output - true if the edge belongs in mst rmm::device_vector next_color; // next iteration color - rmm::device_vector color; // index of color that vertex points to + rmm::device_vector color; // index of color that vertex points to // new src-dst pairs found per iteration rmm::device_vector temp_src; @@ -93,8 +96,7 @@ class MST_solver { void check_termination(); void alteration(); alteration_t alteration_max(); - void append_src_dst_pair(vertex_t* mst_src, vertex_t* mst_dst, - weight_t* mst_weights); + void append_src_dst_pair(vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights); }; } // namespace mst diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh index 562d506cfe..397fecaaea 100644 --- a/cpp/include/raft/sparse/op/filter.cuh +++ b/cpp/include/raft/sparse/op/filter.cuh @@ -42,15 +42,23 @@ namespace sparse { namespace op { template -__global__ void coo_remove_scalar_kernel(const int *rows, const int *cols, - const T *vals, int nnz, int *crows, - int *ccols, T *cvals, int *ex_scan, - int *cur_ex_scan, int m, T scalar) { +__global__ void coo_remove_scalar_kernel(const int* rows, + const int* cols, + const T* vals, + int nnz, + int* crows, + int* ccols, + T* cvals, + int* ex_scan, + int* cur_ex_scan, + int m, + T scalar) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < m) { - int start = cur_ex_scan[row]; - int stop = get_stop_idx(row, m, nnz, cur_ex_scan); + int start = cur_ex_scan[row]; + int stop = get_stop_idx(row, m, nnz, cur_ex_scan); int cur_out_idx = ex_scan[row]; for (int idx = start; idx < stop; idx++) { @@ -82,37 +90,51 @@ __global__ void coo_remove_scalar_kernel(const int *rows, const int *cols, * @param stream: cuda stream to use */ template -void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, - int *crows, int *ccols, T *cvals, int *cnnz, - int *cur_cnnz, T scalar, int n, +void coo_remove_scalar(const int* rows, + const int* cols, + const T* vals, + int nnz, + int* crows, + int* ccols, + T* cvals, + int* cnnz, + int* cur_cnnz, + T scalar, + int n, std::shared_ptr d_alloc, - cudaStream_t stream) { + cudaStream_t stream) +{ raft::mr::device::buffer ex_scan(d_alloc, stream, n); raft::mr::device::buffer cur_ex_scan(d_alloc, stream, n); CUDA_CHECK(cudaMemsetAsync(ex_scan.data(), 0, n * sizeof(int), stream)); CUDA_CHECK(cudaMemsetAsync(cur_ex_scan.data(), 0, n * sizeof(int), stream)); - thrust::device_ptr dev_cnnz = thrust::device_pointer_cast(cnnz); - thrust::device_ptr dev_ex_scan = - thrust::device_pointer_cast(ex_scan.data()); - thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cnnz, dev_cnnz + n, - dev_ex_scan); + thrust::device_ptr dev_cnnz = thrust::device_pointer_cast(cnnz); + thrust::device_ptr dev_ex_scan = thrust::device_pointer_cast(ex_scan.data()); + thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); - thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz); - thrust::device_ptr dev_cur_ex_scan = - thrust::device_pointer_cast(cur_ex_scan.data()); - thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cur_cnnz, - dev_cur_cnnz + n, dev_cur_ex_scan); + thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz); + thrust::device_ptr dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data()); + thrust::exclusive_scan( + thrust::cuda::par.on(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); dim3 grid(raft::ceildiv(n, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - coo_remove_scalar_kernel<<>>( - rows, cols, vals, nnz, crows, ccols, cvals, dev_ex_scan.get(), - dev_cur_ex_scan.get(), n, scalar); + coo_remove_scalar_kernel<<>>(rows, + cols, + vals, + nnz, + crows, + ccols, + cvals, + dev_ex_scan.get(), + dev_cur_ex_scan.get(), + n, + scalar); CUDA_CHECK(cudaPeekAtLastError()); } @@ -126,35 +148,44 @@ void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, * @param stream: cuda stream to use */ template -void coo_remove_scalar(COO *in, COO *out, T scalar, +void coo_remove_scalar(COO* in, + COO* out, + T scalar, std::shared_ptr d_alloc, - cudaStream_t stream) { + cudaStream_t stream) +{ raft::mr::device::buffer row_count_nz(d_alloc, stream, in->n_rows); raft::mr::device::buffer row_count(d_alloc, stream, in->n_rows); - CUDA_CHECK( - cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream)); - CUDA_CHECK( - cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream)); linalg::coo_degree(in->rows(), in->nnz, row_count.data(), stream); CUDA_CHECK(cudaPeekAtLastError()); - linalg::coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, - row_count_nz.data(), stream); + linalg::coo_degree_scalar( + in->rows(), in->vals(), in->nnz, scalar, row_count_nz.data(), stream); CUDA_CHECK(cudaPeekAtLastError()); - thrust::device_ptr d_row_count_nz = - thrust::device_pointer_cast(row_count_nz.data()); - int out_nnz = thrust::reduce(thrust::cuda::par.on(stream), d_row_count_nz, - d_row_count_nz + in->n_rows); + thrust::device_ptr d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data()); + int out_nnz = + thrust::reduce(thrust::cuda::par.on(stream), d_row_count_nz, d_row_count_nz + in->n_rows); out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream); - coo_remove_scalar(in->rows(), in->cols(), in->vals(), in->nnz, - out->rows(), out->cols(), out->vals(), - row_count_nz.data(), row_count.data(), scalar, - in->n_rows, d_alloc, stream); + coo_remove_scalar(in->rows(), + in->cols(), + in->vals(), + in->nnz, + out->rows(), + out->cols(), + out->vals(), + row_count_nz.data(), + row_count.data(), + scalar, + in->n_rows, + d_alloc, + stream); CUDA_CHECK(cudaPeekAtLastError()); } @@ -167,9 +198,11 @@ void coo_remove_scalar(COO *in, COO *out, T scalar, * @param stream: cuda stream to use */ template -void coo_remove_zeros(COO *in, COO *out, +void coo_remove_zeros(COO* in, + COO* out, std::shared_ptr d_alloc, - cudaStream_t stream) { + cudaStream_t stream) +{ coo_remove_scalar(in, out, T(0.0), d_alloc, stream); } diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh index 53c9f89074..bc4d7bace5 100644 --- a/cpp/include/raft/sparse/op/reduce.cuh +++ b/cpp/include/raft/sparse/op/reduce.cuh @@ -46,25 +46,29 @@ namespace sparse { namespace op { template -__global__ void compute_duplicates_diffs_kernel(const value_idx *rows, - const value_idx *cols, - value_idx *diff, size_t nnz) { +__global__ void compute_duplicates_diffs_kernel(const value_idx* rows, + const value_idx* cols, + value_idx* diff, + size_t nnz) +{ size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; value_idx d = 1; - if (tid == 0 || (rows[tid - 1] == rows[tid] && cols[tid - 1] == cols[tid])) - d = 0; + if (tid == 0 || (rows[tid - 1] == rows[tid] && cols[tid - 1] == cols[tid])) d = 0; diff[tid] = d; } template -__global__ void max_duplicates_kernel(const value_idx *src_rows, - const value_idx *src_cols, - const value_t *src_vals, - const value_idx *index, - value_idx *out_rows, value_idx *out_cols, - value_t *out_vals, size_t nnz) { +__global__ void max_duplicates_kernel(const value_idx* src_rows, + const value_idx* src_cols, + const value_t* src_vals, + const value_idx* index, + value_idx* out_rows, + value_idx* out_cols, + value_t* out_vals, + size_t nnz) +{ size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < nnz) { @@ -96,13 +100,13 @@ __global__ void max_duplicates_kernel(const value_idx *src_rows, * @param[in] stream cuda ops will be ordered wrt this stream */ template -void compute_duplicates_mask(value_idx *mask, const value_idx *rows, - const value_idx *cols, size_t nnz, - cudaStream_t stream) { +void compute_duplicates_mask( + value_idx* mask, const value_idx* rows, const value_idx* cols, size_t nnz, cudaStream_t stream) +{ CUDA_CHECK(cudaMemsetAsync(mask, 0, nnz * sizeof(value_idx), stream)); - compute_duplicates_diffs_kernel<<>>(rows, cols, mask, nnz); + compute_duplicates_diffs_kernel<<>>( + rows, cols, mask, nnz); } /** @@ -122,12 +126,17 @@ void compute_duplicates_mask(value_idx *mask, const value_idx *rows, * @param[in] stream cuda ops will be ordered wrt this stream */ template -void max_duplicates(const raft::handle_t &handle, - raft::sparse::COO &out, - const value_idx *rows, const value_idx *cols, - const value_t *vals, size_t nnz, size_t m, size_t n) { +void max_duplicates(const raft::handle_t& handle, + raft::sparse::COO& out, + const value_idx* rows, + const value_idx* cols, + const value_t* vals, + size_t nnz, + size_t m, + size_t n) +{ auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); @@ -136,8 +145,8 @@ void max_duplicates(const raft::handle_t &handle, compute_duplicates_mask(diff.data(), rows, cols, nnz, stream); - thrust::exclusive_scan(thrust::cuda::par.on(stream), diff.data(), - diff.data() + diff.size(), diff.data()); + thrust::exclusive_scan( + thrust::cuda::par.on(stream), diff.data(), diff.data() + diff.size(), diff.data()); // compute final size value_idx size = 0; diff --git a/cpp/include/raft/sparse/op/row_op.cuh b/cpp/include/raft/sparse/op/row_op.cuh index 9e5034dc28..194a878ac1 100644 --- a/cpp/include/raft/sparse/op/row_op.cuh +++ b/cpp/include/raft/sparse/op/row_op.cuh @@ -38,12 +38,12 @@ namespace sparse { namespace op { template void> -__global__ void csr_row_op_kernel(const T *row_ind, T n_rows, T nnz, - Lambda op) { +__global__ void csr_row_op_kernel(const T* row_ind, T n_rows, T nnz, Lambda op) +{ T row = blockIdx.x * TPB_X + threadIdx.x; if (row < n_rows) { T start_idx = row_ind[row]; - T stop_idx = row < n_rows - 1 ? row_ind[row + 1] : nnz; + T stop_idx = row < n_rows - 1 ? row_ind[row + 1] : nnz; op(row, start_idx, stop_idx); } } @@ -59,14 +59,12 @@ __global__ void csr_row_op_kernel(const T *row_ind, T n_rows, T nnz, * @param op custom row operation functor accepting the row and beginning index. * @param stream cuda stream to use */ -template void> -void csr_row_op(const Index_ *row_ind, Index_ n_rows, Index_ nnz, Lambda op, - cudaStream_t stream) { +template void> +void csr_row_op(const Index_* row_ind, Index_ n_rows, Index_ nnz, Lambda op, cudaStream_t stream) +{ dim3 grid(raft::ceildiv(n_rows, Index_(TPB_X)), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_row_op_kernel - <<>>(row_ind, n_rows, nnz, op); + csr_row_op_kernel<<>>(row_ind, n_rows, nnz, op); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/sparse/op/slice.h b/cpp/include/raft/sparse/op/slice.h index 46f4f41879..9bbe04cf34 100644 --- a/cpp/include/raft/sparse/op/slice.h +++ b/cpp/include/raft/sparse/op/slice.h @@ -50,10 +50,14 @@ namespace op { * @param[in] stream : cuda stream for ordering events */ template -void csr_row_slice_indptr(value_idx start_row, value_idx stop_row, - const value_idx *indptr, value_idx *indptr_out, - value_idx *start_offset, value_idx *stop_offset, - cudaStream_t stream) { +void csr_row_slice_indptr(value_idx start_row, + value_idx stop_row, + const value_idx* indptr, + value_idx* indptr_out, + value_idx* start_offset, + value_idx* stop_offset, + cudaStream_t stream) +{ raft::update_host(start_offset, indptr + start_row, 1, stream); raft::update_host(stop_offset, indptr + stop_row + 1, 1, stream); @@ -63,11 +67,12 @@ void csr_row_slice_indptr(value_idx start_row, value_idx stop_row, // 0-based indexing so we need to add 1 to stop row. Because we want n_rows+1, // we add another 1 to stop row. - raft::copy_async(indptr_out, indptr + start_row, (stop_row + 2) - start_row, - stream); + raft::copy_async(indptr_out, indptr + start_row, (stop_row + 2) - start_row, stream); raft::linalg::unaryOp( - indptr_out, indptr_out, (stop_row + 2) - start_row, + indptr_out, + indptr_out, + (stop_row + 2) - start_row, [s_offset] __device__(value_idx input) { return input - s_offset; }, stream); } @@ -85,12 +90,15 @@ void csr_row_slice_indptr(value_idx start_row, value_idx stop_row, * @param[in] stream : cuda stream for ordering events */ template -void csr_row_slice_populate(value_idx start_offset, value_idx stop_offset, - const value_idx *indices, const value_t *data, - value_idx *indices_out, value_t *data_out, - cudaStream_t stream) { - raft::copy(indices_out, indices + start_offset, stop_offset - start_offset, - stream); +void csr_row_slice_populate(value_idx start_offset, + value_idx stop_offset, + const value_idx* indices, + const value_t* data, + value_idx* indices_out, + value_t* data_out, + cudaStream_t stream) +{ + raft::copy(indices_out, indices + start_offset, stop_offset - start_offset, stream); raft::copy(data_out, data + start_offset, stop_offset - start_offset, stream); } diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h index 9dbe2b67c5..3cab24fc09 100644 --- a/cpp/include/raft/sparse/op/sort.h +++ b/cpp/include/raft/sparse/op/sort.h @@ -42,7 +42,8 @@ namespace op { struct TupleComp { template - __host__ __device__ bool operator()(const one &t1, const two &t2) { + __host__ __device__ bool operator()(const one& t1, const two& t2) + { // sort first by each sample's color, if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false; @@ -66,15 +67,21 @@ struct TupleComp { * @param stream: cuda stream to use */ template -void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, +void coo_sort(int m, + int n, + int nnz, + int* rows, + int* cols, + T* vals, // TODO: Remove this std::shared_ptr d_alloc, - cudaStream_t stream) { + cudaStream_t stream) +{ auto coo_indices = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); // get all the colors in contiguous locations so we can map them to warps. - thrust::sort_by_key(thrust::cuda::par.on(stream), coo_indices, - coo_indices + nnz, vals, TupleComp()); + thrust::sort_by_key( + thrust::cuda::par.on(stream), coo_indices, coo_indices + nnz, vals, TupleComp()); } /** @@ -85,12 +92,12 @@ void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, * @param stream: the cuda stream to use */ template -void coo_sort(COO *const in, +void coo_sort(COO* const in, // TODO: Remove this std::shared_ptr d_alloc, - cudaStream_t stream) { - coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), - in->vals(), d_alloc, stream); + cudaStream_t stream) +{ + coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), d_alloc, stream); } /** @@ -104,16 +111,16 @@ void coo_sort(COO *const in, * @param[in] stream cuda stream for which to order cuda operations */ template -void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data, - value_idx nnz, cudaStream_t stream) { +void coo_sort_by_weight( + value_idx* rows, value_idx* cols, value_t* data, value_idx nnz, cudaStream_t stream) +{ thrust::device_ptr t_rows = thrust::device_pointer_cast(rows); thrust::device_ptr t_cols = thrust::device_pointer_cast(cols); - thrust::device_ptr t_data = thrust::device_pointer_cast(data); + thrust::device_ptr t_data = thrust::device_pointer_cast(data); auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); - thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz, - first); + thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz, first); } }; // namespace op }; // end NAMESPACE sparse diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh index 8aae90f1d8..ec8bec6eb3 100644 --- a/cpp/include/raft/sparse/selection/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/connect_components.cuh @@ -59,17 +59,20 @@ struct KeyValuePair { __host__ __device__ __forceinline__ KeyValuePair() {} /// Copy Constructor - __host__ __device__ __forceinline__ - KeyValuePair(cub::KeyValuePair<_Key, _Value> kvp) - : key(kvp.key), value(kvp.value) {} + __host__ __device__ __forceinline__ KeyValuePair(cub::KeyValuePair<_Key, _Value> kvp) + : key(kvp.key), value(kvp.value) + { + } /// Constructor - __host__ __device__ __forceinline__ KeyValuePair(Key const &key, - Value const &value) - : key(key), value(value) {} + __host__ __device__ __forceinline__ KeyValuePair(Key const& key, Value const& value) + : key(key), value(value) + { + } /// Inequality operator - __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair &b) { + __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair& b) + { return (value != b.value) || (key != b.key); } }; @@ -83,31 +86,32 @@ struct KeyValuePair { */ template struct FixConnectivitiesRedOp { - value_idx *colors; + value_idx* colors; value_idx m; - FixConnectivitiesRedOp(value_idx *colors_, value_idx m_) - : colors(colors_), m(m_){}; + FixConnectivitiesRedOp(value_idx* colors_, value_idx m_) : colors(colors_), m(m_){}; typedef typename cub::KeyValuePair KVP; - DI void operator()(value_idx rit, KVP *out, const KVP &other) { - if (rit < m && other.value < out->value && - colors[rit] != colors[other.key]) { - out->key = other.key; + DI void operator()(value_idx rit, KVP* out, const KVP& other) + { + if (rit < m && other.value < out->value && colors[rit] != colors[other.key]) { + out->key = other.key; out->value = other.value; } } - DI KVP operator()(value_idx rit, const KVP &a, const KVP &b) { + DI KVP operator()(value_idx rit, const KVP& a, const KVP& b) + { if (rit < m && a.value < b.value && colors[rit] != colors[a.key]) { return a; } else return b; } - DI void init(value_t *out, value_t maxVal) { *out = maxVal; } - DI void init(KVP *out, value_t maxVal) { - out->key = -1; + DI void init(value_t* out, value_t maxVal) { *out = maxVal; } + DI void init(KVP* out, value_t maxVal) + { + out->key = -1; out->value = maxVal; } }; @@ -119,7 +123,8 @@ struct FixConnectivitiesRedOp { */ struct TupleComp { template - __host__ __device__ bool operator()(const one &t1, const two &t2) { + __host__ __device__ bool operator()(const one& t1, const two& t2) + { // sort first by each sample's color, if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false; @@ -137,13 +142,9 @@ template struct CubKVPMinReduce { typedef cub::KeyValuePair KVP; - DI KVP operator()(LabelT rit, const KVP &a, const KVP &b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } - DI KVP operator()(const KVP &a, const KVP &b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } }; // KVPMinReduce @@ -158,13 +159,14 @@ struct CubKVPMinReduce { * @return total number of components */ template -value_idx get_n_components(value_idx *colors, size_t n_rows, +value_idx get_n_components(value_idx* colors, + size_t n_rows, std::shared_ptr d_alloc, - cudaStream_t stream) { - value_idx *map_ids; + cudaStream_t stream) +{ + value_idx* map_ids; int num_clusters; - raft::label::getUniquelabels(colors, n_rows, &map_ids, &num_clusters, stream, - d_alloc); + raft::label::getUniquelabels(colors, n_rows, &map_ids, &num_clusters, stream, d_alloc); d_alloc->deallocate(map_ids, num_clusters * sizeof(value_idx), stream); return num_clusters; @@ -177,11 +179,12 @@ value_idx get_n_components(value_idx *colors, size_t n_rows, */ template struct LookupColorOp { - value_idx *colors; + value_idx* colors; - LookupColorOp(value_idx *colors_) : colors(colors_) {} + LookupColorOp(value_idx* colors_) : colors(colors_) {} - DI value_idx operator()(const cub::KeyValuePair &kvp) { + DI value_idx operator()(const cub::KeyValuePair& kvp) + { return colors[kvp.key]; } }; @@ -191,7 +194,8 @@ struct LookupColorOp { * the given array of components * @tparam value_idx * @tparam value_t - * @param[out] kvp mapping of closest neighbor vertex and distance for each vertex in the given array of components + * @param[out] kvp mapping of closest neighbor vertex and distance for each vertex in the given + * array of components * @param[out] nn_colors components of nearest neighbors for each vertex * @param[in] colors components of each vertex * @param[in] X original dense data @@ -201,25 +205,39 @@ struct LookupColorOp { * @param[in] stream cuda stream for which to order cuda operations */ template -void perform_1nn(cub::KeyValuePair *kvp, - value_idx *nn_colors, value_idx *colors, const value_t *X, - size_t n_rows, size_t n_cols, +void perform_1nn(cub::KeyValuePair* kvp, + value_idx* nn_colors, + value_idx* colors, + const value_t* X, + size_t n_rows, + size_t n_cols, std::shared_ptr d_alloc, - cudaStream_t stream, red_op reduction_op) { + cudaStream_t stream, + red_op reduction_op) +{ rmm::device_uvector workspace(n_rows, stream); rmm::device_uvector x_norm(n_rows, stream); - raft::linalg::rowNorm(x_norm.data(), X, n_cols, n_rows, raft::linalg::L2Norm, - true, stream); - - raft::distance::fusedL2NN, - value_idx>( - kvp, X, X, x_norm.data(), x_norm.data(), n_rows, n_rows, n_cols, - workspace.data(), reduction_op, reduction_op, true, true, stream); + raft::linalg::rowNorm(x_norm.data(), X, n_cols, n_rows, raft::linalg::L2Norm, true, stream); + + raft::distance::fusedL2NN, value_idx>( + kvp, + X, + X, + x_norm.data(), + x_norm.data(), + n_rows, + n_rows, + n_cols, + workspace.data(), + reduction_op, + reduction_op, + true, + true, + stream); LookupColorOp extract_colors_op(colors); - thrust::transform(thrust::cuda::par.on(stream), kvp, kvp + n_rows, nn_colors, - extract_colors_op); + thrust::transform(thrust::cuda::par.on(stream), kvp, kvp + n_rows, nn_colors, extract_colors_op); } /** @@ -235,27 +253,33 @@ void perform_1nn(cub::KeyValuePair *kvp, * @param stream stream for which to order CUDA operations */ template -void sort_by_color(value_idx *colors, value_idx *nn_colors, - cub::KeyValuePair *kvp, - value_idx *src_indices, size_t n_rows, cudaStream_t stream) { +void sort_by_color(value_idx* colors, + value_idx* nn_colors, + cub::KeyValuePair* kvp, + value_idx* src_indices, + size_t n_rows, + cudaStream_t stream) +{ thrust::counting_iterator arg_sort_iter(0); - thrust::copy(thrust::cuda::par.on(stream), arg_sort_iter, - arg_sort_iter + n_rows, src_indices); + thrust::copy(thrust::cuda::par.on(stream), arg_sort_iter, arg_sort_iter + n_rows, src_indices); - auto keys = thrust::make_zip_iterator(thrust::make_tuple( - colors, nn_colors, (raft::linkage::KeyValuePair *)kvp)); + auto keys = thrust::make_zip_iterator( + thrust::make_tuple(colors, nn_colors, (raft::linkage::KeyValuePair*)kvp)); auto vals = thrust::make_zip_iterator(thrust::make_tuple(src_indices)); // get all the colors in contiguous locations so we can map them to warps. - thrust::sort_by_key(thrust::cuda::par.on(stream), keys, keys + n_rows, vals, - TupleComp()); + thrust::sort_by_key(thrust::cuda::par.on(stream), keys, keys + n_rows, vals, TupleComp()); } template -__global__ void min_components_by_color_kernel( - value_idx *out_rows, value_idx *out_cols, value_t *out_vals, - const value_idx *out_index, const value_idx *indices, - const cub::KeyValuePair *kvp, size_t nnz) { +__global__ void min_components_by_color_kernel(value_idx* out_rows, + value_idx* out_cols, + value_t* out_vals, + const value_idx* out_index, + const value_idx* indices, + const cub::KeyValuePair* kvp, + size_t nnz) +{ size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; @@ -284,19 +308,20 @@ __global__ void min_components_by_color_kernel( * @param[in] stream cuda stream for which to order cuda operations */ template -void min_components_by_color(raft::sparse::COO &coo, - const value_idx *out_index, - const value_idx *indices, - const cub::KeyValuePair *kvp, - size_t nnz, cudaStream_t stream) { +void min_components_by_color(raft::sparse::COO& coo, + const value_idx* out_index, + const value_idx* indices, + const cub::KeyValuePair* kvp, + size_t nnz, + cudaStream_t stream) +{ /** * Arrays should be ordered by: colors_indptr->colors_n->kvp.value * so the last element of each column in the input CSR should be * the min. */ - min_components_by_color_kernel<<>>(coo.rows(), coo.cols(), coo.vals(), - out_index, indices, kvp, nnz); + min_components_by_color_kernel<<>>( + coo.rows(), coo.cols(), coo.vals(), out_index, indices, kvp, nnz); } /** @@ -318,14 +343,18 @@ void min_components_by_color(raft::sparse::COO &coo, * @param[in] n_cols number of cols in X */ template -void connect_components(const raft::handle_t &handle, - raft::sparse::COO &out, - const value_t *X, const value_idx *orig_colors, - size_t n_rows, size_t n_cols, red_op reduction_op, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2SqrtExpanded) { +void connect_components( + const raft::handle_t& handle, + raft::sparse::COO& out, + const value_t* X, + const value_idx* orig_colors, + size_t n_rows, + size_t n_cols, + red_op reduction_op, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) +{ auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded, "Fixing connectivities for an unconnected k-NN graph only " @@ -335,47 +364,52 @@ void connect_components(const raft::handle_t &handle, raft::copy_async(colors.data(), orig_colors, n_rows, stream); // Normalize colors so they are drawn from a monotonically increasing set - raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, - d_alloc, true); + raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, d_alloc, true); - value_idx n_components = - get_n_components(colors.data(), n_rows, d_alloc, stream); + value_idx n_components = get_n_components(colors.data(), n_rows, d_alloc, stream); /** * First compute 1-nn for all colors where the color of each data point * is guaranteed to be != color of its nearest neighbor. */ rmm::device_uvector nn_colors(n_rows, stream); - rmm::device_uvector> temp_inds_dists( - n_rows, stream); + rmm::device_uvector> temp_inds_dists(n_rows, stream); rmm::device_uvector src_indices(n_rows, stream); - perform_1nn(temp_inds_dists.data(), nn_colors.data(), colors.data(), X, - n_rows, n_cols, d_alloc, stream, reduction_op); + perform_1nn(temp_inds_dists.data(), + nn_colors.data(), + colors.data(), + X, + n_rows, + n_cols, + d_alloc, + stream, + reduction_op); /** * Sort data points by color (neighbors are not sorted) */ // max_color + 1 = number of connected components // sort nn_colors by key w/ original colors - sort_by_color(colors.data(), nn_colors.data(), temp_inds_dists.data(), - src_indices.data(), n_rows, stream); + sort_by_color( + colors.data(), nn_colors.data(), temp_inds_dists.data(), src_indices.data(), n_rows, stream); /** * Take the min for any duplicate colors */ // Compute mask of duplicates rmm::device_uvector out_index(n_rows + 1, stream); - raft::sparse::op::compute_duplicates_mask(out_index.data(), colors.data(), - nn_colors.data(), n_rows, stream); + raft::sparse::op::compute_duplicates_mask( + out_index.data(), colors.data(), nn_colors.data(), n_rows, stream); - thrust::exclusive_scan(thrust::cuda::par.on(stream), out_index.data(), - out_index.data() + out_index.size(), out_index.data()); + thrust::exclusive_scan(thrust::cuda::par.on(stream), + out_index.data(), + out_index.data() + out_index.size(), + out_index.data()); // compute final size value_idx size = 0; - raft::update_host(&size, out_index.data() + (out_index.size() - 1), 1, - stream); + raft::update_host(&size, out_index.data() + (out_index.size() - 1), 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); size++; @@ -383,14 +417,14 @@ void connect_components(const raft::handle_t &handle, raft::sparse::COO min_edges(d_alloc, stream); min_edges.allocate(size, n_rows, n_rows, true, stream); - min_components_by_color(min_edges, out_index.data(), src_indices.data(), - temp_inds_dists.data(), n_rows, stream); + min_components_by_color( + min_edges, out_index.data(), src_indices.data(), temp_inds_dists.data(), n_rows, stream); /** * Symmetrize resulting edge list */ - raft::sparse::linalg::symmetrize(handle, min_edges.rows(), min_edges.cols(), - min_edges.vals(), n_rows, n_rows, size, out); + raft::sparse::linalg::symmetrize( + handle, min_edges.rows(), min_edges.cols(), min_edges.vals(), n_rows, n_rows, size, out); } }; // end namespace linkage diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh index 71fbb8ab3d..dbb24ee334 100644 --- a/cpp/include/raft/sparse/selection/knn.cuh +++ b/cpp/include/raft/sparse/selection/knn.cuh @@ -49,9 +49,11 @@ namespace selection { template struct csr_batcher_t { - csr_batcher_t(value_idx batch_size, value_idx n_rows, - const value_idx *csr_indptr, const value_idx *csr_indices, - const value_t *csr_data) + csr_batcher_t(value_idx batch_size, + value_idx n_rows, + const value_idx* csr_indptr, + const value_idx* csr_indices, + const value_t* csr_data) : batch_start_(0), batch_stop_(0), batch_rows_(0), @@ -61,32 +63,42 @@ struct csr_batcher_t { csr_indices_(csr_indices), csr_data_(csr_data), batch_csr_start_offset_(0), - batch_csr_stop_offset_(0) {} + batch_csr_stop_offset_(0) + { + } - void set_batch(int batch_num) { + void set_batch(int batch_num) + { batch_start_ = batch_num * batch_size_; - batch_stop_ = batch_start_ + batch_size_ - 1; // zero-based indexing + batch_stop_ = batch_start_ + batch_size_ - 1; // zero-based indexing - if (batch_stop_ >= total_rows_) - batch_stop_ = total_rows_ - 1; // zero-based indexing + if (batch_stop_ >= total_rows_) batch_stop_ = total_rows_ - 1; // zero-based indexing batch_rows_ = (batch_stop_ - batch_start_) + 1; } - value_idx get_batch_csr_indptr_nnz(value_idx *batch_indptr, - cudaStream_t stream) { - raft::sparse::op::csr_row_slice_indptr( - batch_start_, batch_stop_, csr_indptr_, batch_indptr, - &batch_csr_start_offset_, &batch_csr_stop_offset_, stream); + value_idx get_batch_csr_indptr_nnz(value_idx* batch_indptr, cudaStream_t stream) + { + raft::sparse::op::csr_row_slice_indptr(batch_start_, + batch_stop_, + csr_indptr_, + batch_indptr, + &batch_csr_start_offset_, + &batch_csr_stop_offset_, + stream); return batch_csr_stop_offset_ - batch_csr_start_offset_; } - void get_batch_csr_indices_data(value_idx *csr_indices, value_t *csr_data, - cudaStream_t stream) { - raft::sparse::op::csr_row_slice_populate( - batch_csr_start_offset_, batch_csr_stop_offset_, csr_indices_, csr_data_, - csr_indices, csr_data, stream); + void get_batch_csr_indices_data(value_idx* csr_indices, value_t* csr_data, cudaStream_t stream) + { + raft::sparse::op::csr_row_slice_populate(batch_csr_start_offset_, + batch_csr_stop_offset_, + csr_indices_, + csr_data_, + csr_indices, + csr_data, + stream); } value_idx batch_rows() const { return batch_rows_; } @@ -103,9 +115,9 @@ struct csr_batcher_t { value_idx total_rows_; - const value_idx *csr_indptr_; - const value_idx *csr_indices_; - const value_t *csr_data_; + const value_idx* csr_indptr_; + const value_idx* csr_indices_; + const value_t* csr_data_; value_idx batch_csr_start_offset_; value_idx batch_csr_stop_offset_; @@ -114,18 +126,26 @@ struct csr_batcher_t { template class sparse_knn_t { public: - sparse_knn_t(const value_idx *idxIndptr_, const value_idx *idxIndices_, - const value_t *idxData_, size_t idxNNZ_, int n_idx_rows_, - int n_idx_cols_, const value_idx *queryIndptr_, - const value_idx *queryIndices_, const value_t *queryData_, - size_t queryNNZ_, int n_query_rows_, int n_query_cols_, - value_idx *output_indices_, value_t *output_dists_, int k_, - const raft::handle_t &handle_, - size_t batch_size_index_ = 2 << 14, // approx 1M - size_t batch_size_query_ = 2 << 14, - raft::distance::DistanceType metric_ = - raft::distance::DistanceType::L2Expanded, - float metricArg_ = 0) + sparse_knn_t(const value_idx* idxIndptr_, + const value_idx* idxIndices_, + const value_t* idxData_, + size_t idxNNZ_, + int n_idx_rows_, + int n_idx_cols_, + const value_idx* queryIndptr_, + const value_idx* queryIndices_, + const value_t* queryData_, + size_t queryNNZ_, + int n_query_rows_, + int n_query_cols_, + value_idx* output_indices_, + value_t* output_dists_, + int k_, + const raft::handle_t& handle_, + size_t batch_size_index_ = 2 << 14, // approx 1M + size_t batch_size_query_ = 2 << 14, + raft::distance::DistanceType metric_ = raft::distance::DistanceType::L2Expanded, + float metricArg_ = 0) : idxIndptr(idxIndptr_), idxIndices(idxIndices_), idxData(idxData_), @@ -145,9 +165,12 @@ class sparse_knn_t { batch_size_index(batch_size_index_), batch_size_query(batch_size_query_), metric(metric_), - metricArg(metricArg_) {} + metricArg(metricArg_) + { + } - void run() { + void run() + { using namespace raft::sparse; int n_batches_query = raft::ceildiv((size_t)n_query_rows, batch_size_query); @@ -158,37 +181,33 @@ class sparse_knn_t { for (int i = 0; i < n_batches_query; i++) { /** - * Compute index batch info - */ + * Compute index batch info + */ query_batcher.set_batch(i); /** - * Slice CSR to rows in batch - */ + * Slice CSR to rows in batch + */ - rmm::device_uvector query_batch_indptr( - query_batcher.batch_rows() + 1, handle.get_stream()); + rmm::device_uvector query_batch_indptr(query_batcher.batch_rows() + 1, + handle.get_stream()); - value_idx n_query_batch_nnz = query_batcher.get_batch_csr_indptr_nnz( - query_batch_indptr.data(), handle.get_stream()); + value_idx n_query_batch_nnz = + query_batcher.get_batch_csr_indptr_nnz(query_batch_indptr.data(), handle.get_stream()); - rmm::device_uvector query_batch_indices(n_query_batch_nnz, - handle.get_stream()); - rmm::device_uvector query_batch_data(n_query_batch_nnz, - handle.get_stream()); + rmm::device_uvector query_batch_indices(n_query_batch_nnz, handle.get_stream()); + rmm::device_uvector query_batch_data(n_query_batch_nnz, handle.get_stream()); - query_batcher.get_batch_csr_indices_data(query_batch_indices.data(), - query_batch_data.data(), - handle.get_stream()); + query_batcher.get_batch_csr_indices_data( + query_batch_indices.data(), query_batch_data.data(), handle.get_stream()); // A 3-partition temporary merge space to scale the batching. 2 parts for subsequent // batches and 1 space for the results of the merge, which get copied back to the top - rmm::device_uvector merge_buffer_indices(0, - handle.get_stream()); + rmm::device_uvector merge_buffer_indices(0, handle.get_stream()); rmm::device_uvector merge_buffer_dists(0, handle.get_stream()); - value_t *dists_merge_buffer_ptr; - value_idx *indices_merge_buffer_ptr; + value_t* dists_merge_buffer_ptr; + value_idx* indices_merge_buffer_ptr; int n_batches_idx = raft::ceildiv((size_t)n_idx_rows, batch_size_index); csr_batcher_t idx_batcher( @@ -197,22 +216,19 @@ class sparse_knn_t { for (int j = 0; j < n_batches_idx; j++) { idx_batcher.set_batch(j); - merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, - handle.get_stream()); - merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, - handle.get_stream()); + merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, handle.get_stream()); + merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, handle.get_stream()); /** - * Slice CSR to rows in batch - */ - rmm::device_uvector idx_batch_indptr( - idx_batcher.batch_rows() + 1, handle.get_stream()); - rmm::device_uvector idx_batch_indices(0, - handle.get_stream()); + * Slice CSR to rows in batch + */ + rmm::device_uvector idx_batch_indptr(idx_batcher.batch_rows() + 1, + handle.get_stream()); + rmm::device_uvector idx_batch_indices(0, handle.get_stream()); rmm::device_uvector idx_batch_data(0, handle.get_stream()); - value_idx idx_batch_nnz = idx_batcher.get_batch_csr_indptr_nnz( - idx_batch_indptr.data(), handle.get_stream()); + value_idx idx_batch_nnz = + idx_batcher.get_batch_csr_indptr_nnz(idx_batch_indptr.data(), handle.get_stream()); idx_batch_indices.resize(idx_batch_nnz, handle.get_stream()); idx_batch_data.resize(idx_batch_nnz, handle.get_stream()); @@ -221,111 +237,126 @@ class sparse_knn_t { idx_batch_indices.data(), idx_batch_data.data(), handle.get_stream()); /** - * Compute distances - */ - size_t dense_size = - idx_batcher.batch_rows() * query_batcher.batch_rows(); - rmm::device_uvector batch_dists(dense_size, - handle.get_stream()); - - CUDA_CHECK(cudaMemset(batch_dists.data(), 0, - batch_dists.size() * sizeof(value_t))); - - compute_distances(idx_batcher, query_batcher, idx_batch_nnz, - n_query_batch_nnz, idx_batch_indptr.data(), - idx_batch_indices.data(), idx_batch_data.data(), - query_batch_indptr.data(), query_batch_indices.data(), - query_batch_data.data(), batch_dists.data()); + * Compute distances + */ + size_t dense_size = idx_batcher.batch_rows() * query_batcher.batch_rows(); + rmm::device_uvector batch_dists(dense_size, handle.get_stream()); + + CUDA_CHECK(cudaMemset(batch_dists.data(), 0, batch_dists.size() * sizeof(value_t))); + + compute_distances(idx_batcher, + query_batcher, + idx_batch_nnz, + n_query_batch_nnz, + idx_batch_indptr.data(), + idx_batch_indices.data(), + idx_batch_data.data(), + query_batch_indptr.data(), + query_batch_indices.data(), + query_batch_data.data(), + batch_dists.data()); // Build batch indices array - rmm::device_uvector batch_indices(batch_dists.size(), - handle.get_stream()); + rmm::device_uvector batch_indices(batch_dists.size(), handle.get_stream()); // populate batch indices array - value_idx batch_rows = query_batcher.batch_rows(), - batch_cols = idx_batcher.batch_rows(); + value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows(); - iota_fill(batch_indices.data(), batch_rows, batch_cols, - handle.get_stream()); + iota_fill(batch_indices.data(), batch_rows, batch_cols, handle.get_stream()); /** * Perform k-selection on batch & merge with other k-selections */ size_t merge_buffer_offset = batch_rows * k; - dists_merge_buffer_ptr = - merge_buffer_dists.data() + merge_buffer_offset; - indices_merge_buffer_ptr = - merge_buffer_indices.data() + merge_buffer_offset; - - perform_k_selection(idx_batcher, query_batcher, batch_dists.data(), - batch_indices.data(), dists_merge_buffer_ptr, + dists_merge_buffer_ptr = merge_buffer_dists.data() + merge_buffer_offset; + indices_merge_buffer_ptr = merge_buffer_indices.data() + merge_buffer_offset; + + perform_k_selection(idx_batcher, + query_batcher, + batch_dists.data(), + batch_indices.data(), + dists_merge_buffer_ptr, indices_merge_buffer_ptr); - value_t *dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr; - value_idx *indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr; + value_t* dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr; + value_idx* indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr; // Merge results of difference batches if necessary if (idx_batcher.batch_start() > 0) { - size_t merge_buffer_tmp_out = batch_rows * k * 2; - dists_merge_buffer_tmp_ptr = - merge_buffer_dists.data() + merge_buffer_tmp_out; - indices_merge_buffer_tmp_ptr = - merge_buffer_indices.data() + merge_buffer_tmp_out; - - merge_batches(idx_batcher, query_batcher, merge_buffer_dists.data(), - merge_buffer_indices.data(), dists_merge_buffer_tmp_ptr, + size_t merge_buffer_tmp_out = batch_rows * k * 2; + dists_merge_buffer_tmp_ptr = merge_buffer_dists.data() + merge_buffer_tmp_out; + indices_merge_buffer_tmp_ptr = merge_buffer_indices.data() + merge_buffer_tmp_out; + + merge_batches(idx_batcher, + query_batcher, + merge_buffer_dists.data(), + merge_buffer_indices.data(), + dists_merge_buffer_tmp_ptr, indices_merge_buffer_tmp_ptr); } // copy merged output back into merge buffer partition for next iteration raft::copy_async(merge_buffer_indices.data(), indices_merge_buffer_tmp_ptr, - batch_rows * k, handle.get_stream()); + batch_rows * k, + handle.get_stream()); raft::copy_async(merge_buffer_dists.data(), - dists_merge_buffer_tmp_ptr, batch_rows * k, + dists_merge_buffer_tmp_ptr, + batch_rows * k, handle.get_stream()); } // Copy final merged batch to output array - raft::copy_async( - output_indices + (rows_processed * k), merge_buffer_indices.data(), - query_batcher.batch_rows() * k, handle.get_stream()); - raft::copy_async( - output_dists + (rows_processed * k), merge_buffer_dists.data(), - query_batcher.batch_rows() * k, handle.get_stream()); + raft::copy_async(output_indices + (rows_processed * k), + merge_buffer_indices.data(), + query_batcher.batch_rows() * k, + handle.get_stream()); + raft::copy_async(output_dists + (rows_processed * k), + merge_buffer_dists.data(), + query_batcher.batch_rows() * k, + handle.get_stream()); rows_processed += query_batcher.batch_rows(); } } private: - void merge_batches(csr_batcher_t &idx_batcher, - csr_batcher_t &query_batcher, - value_t *merge_buffer_dists, - value_idx *merge_buffer_indices, value_t *out_dists, - value_idx *out_indices) { + void merge_batches(csr_batcher_t& idx_batcher, + csr_batcher_t& query_batcher, + value_t* merge_buffer_dists, + value_idx* merge_buffer_indices, + value_t* out_dists, + value_idx* out_indices) + { // build translation buffer to shift resulting indices by the batch std::vector id_ranges; id_ranges.push_back(0); id_ranges.push_back(idx_batcher.batch_start()); rmm::device_uvector trans(id_ranges.size(), handle.get_stream()); - raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(), - handle.get_stream()); + raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(), handle.get_stream()); // combine merge buffers only if there's more than 1 partition to combine - raft::spatial::knn::knn_merge_parts( - merge_buffer_dists, merge_buffer_indices, out_dists, out_indices, - query_batcher.batch_rows(), 2, k, handle.get_stream(), trans.data()); + raft::spatial::knn::knn_merge_parts(merge_buffer_dists, + merge_buffer_indices, + out_dists, + out_indices, + query_batcher.batch_rows(), + 2, + k, + handle.get_stream(), + trans.data()); } void perform_k_selection(csr_batcher_t idx_batcher, csr_batcher_t query_batcher, - value_t *batch_dists, value_idx *batch_indices, - value_t *out_dists, value_idx *out_indices) { + value_t* batch_dists, + value_idx* batch_indices, + value_t* out_dists, + value_idx* out_indices) + { // populate batch indices array - value_idx batch_rows = query_batcher.batch_rows(), - batch_cols = idx_batcher.batch_rows(); + value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows(); // build translation buffer to shift resulting indices by the batch std::vector id_ranges; @@ -340,51 +371,60 @@ class sparse_knn_t { if (metric == raft::distance::DistanceType::InnerProduct) ascending = false; // kernel to slice first (min) k cols and copy into batched merge buffer - select_k(batch_dists, batch_indices, batch_rows, batch_cols, out_dists, - out_indices, ascending, n_neighbors, handle.get_stream()); + select_k(batch_dists, + batch_indices, + batch_rows, + batch_cols, + out_dists, + out_indices, + ascending, + n_neighbors, + handle.get_stream()); } - void compute_distances(csr_batcher_t &idx_batcher, - csr_batcher_t &query_batcher, - size_t idx_batch_nnz, size_t query_batch_nnz, - value_idx *idx_batch_indptr, - value_idx *idx_batch_indices, value_t *idx_batch_data, - value_idx *query_batch_indptr, - value_idx *query_batch_indices, - value_t *query_batch_data, value_t *batch_dists) { + void compute_distances(csr_batcher_t& idx_batcher, + csr_batcher_t& query_batcher, + size_t idx_batch_nnz, + size_t query_batch_nnz, + value_idx* idx_batch_indptr, + value_idx* idx_batch_indices, + value_t* idx_batch_data, + value_idx* query_batch_indptr, + value_idx* query_batch_indices, + value_t* query_batch_data, + value_t* batch_dists) + { /** * Compute distances */ - raft::sparse::distance::distances_config_t dist_config( - handle); + raft::sparse::distance::distances_config_t dist_config(handle); dist_config.b_nrows = idx_batcher.batch_rows(); dist_config.b_ncols = n_idx_cols; - dist_config.b_nnz = idx_batch_nnz; + dist_config.b_nnz = idx_batch_nnz; - dist_config.b_indptr = idx_batch_indptr; + dist_config.b_indptr = idx_batch_indptr; dist_config.b_indices = idx_batch_indices; - dist_config.b_data = idx_batch_data; + dist_config.b_data = idx_batch_data; dist_config.a_nrows = query_batcher.batch_rows(); dist_config.a_ncols = n_query_cols; - dist_config.a_nnz = query_batch_nnz; + dist_config.a_nnz = query_batch_nnz; - dist_config.a_indptr = query_batch_indptr; + dist_config.a_indptr = query_batch_indptr; dist_config.a_indices = query_batch_indices; - dist_config.a_data = query_batch_data; + dist_config.a_data = query_batch_data; if (raft::sparse::distance::supportedDistance.find(metric) == raft::sparse::distance::supportedDistance.end()) THROW("DistanceType not supported: %d", metric); - raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric, - metricArg); + raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric, metricArg); } const value_idx *idxIndptr, *idxIndices, *queryIndptr, *queryIndices; - value_idx *output_indices; + value_idx* output_indices; const value_t *idxData, *queryData; - value_t *output_dists; + value_t* output_dists; size_t idxNNZ, queryNNZ, batch_size_index, batch_size_query; @@ -394,52 +434,76 @@ class sparse_knn_t { int n_idx_rows, n_idx_cols, n_query_rows, n_query_cols, k; - const raft::handle_t &handle; + const raft::handle_t& handle; }; /** - * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors - * using some distance implementation - * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1) - * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz) - * @param[in] idxData csr data array of the index matrix (size idxNNZ) - * @param[in] idxNNA number of non-zeros for sparse index matrix - * @param[in] n_idx_rows number of data samples in index matrix - * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1) - * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ) - * @param[in] queryData csr data array of the query matrix (size queryNNZ) - * @param[in] queryNNZ number of non-zeros for sparse query matrix - * @param[in] n_query_rows number of data samples in query matrix - * @param[in] n_query_cols number of features in query matrix - * @param[out] output_indices dense matrix for output indices (size n_query_rows * k) - * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) - * @param[in] k the number of neighbors to query - * @param[in] cusparseHandle the initialized cusparseHandle instance to use - * @param[in] allocator device allocator instance to use - * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to - * @param[in] batch_size_index maximum number of rows to use from index matrix per batch - * @param[in] batch_size_query maximum number of rows to use from query matrix per batch - * @param[in] metric distance metric/measure to use - * @param[in] metricArg potential argument for metric (currently unused) - */ + * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors + * using some distance implementation + * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1) + * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz) + * @param[in] idxData csr data array of the index matrix (size idxNNZ) + * @param[in] idxNNA number of non-zeros for sparse index matrix + * @param[in] n_idx_rows number of data samples in index matrix + * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1) + * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ) + * @param[in] queryData csr data array of the query matrix (size queryNNZ) + * @param[in] queryNNZ number of non-zeros for sparse query matrix + * @param[in] n_query_rows number of data samples in query matrix + * @param[in] n_query_cols number of features in query matrix + * @param[out] output_indices dense matrix for output indices (size n_query_rows * k) + * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) + * @param[in] k the number of neighbors to query + * @param[in] cusparseHandle the initialized cusparseHandle instance to use + * @param[in] allocator device allocator instance to use + * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to + * @param[in] batch_size_index maximum number of rows to use from index matrix per batch + * @param[in] batch_size_query maximum number of rows to use from query matrix per batch + * @param[in] metric distance metric/measure to use + * @param[in] metricArg potential argument for metric (currently unused) + */ template -void brute_force_knn(const value_idx *idxIndptr, const value_idx *idxIndices, - const value_t *idxData, size_t idxNNZ, int n_idx_rows, - int n_idx_cols, const value_idx *queryIndptr, - const value_idx *queryIndices, const value_t *queryData, - size_t queryNNZ, int n_query_rows, int n_query_cols, - value_idx *output_indices, value_t *output_dists, int k, - const raft::handle_t &handle, - size_t batch_size_index = 2 << 14, // approx 1M - size_t batch_size_query = 2 << 14, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2Expanded, - float metricArg = 0) { - sparse_knn_t( - idxIndptr, idxIndices, idxData, idxNNZ, n_idx_rows, n_idx_cols, queryIndptr, - queryIndices, queryData, queryNNZ, n_query_rows, n_query_cols, - output_indices, output_dists, k, handle, batch_size_index, batch_size_query, - metric, metricArg) +void brute_force_knn(const value_idx* idxIndptr, + const value_idx* idxIndices, + const value_t* idxData, + size_t idxNNZ, + int n_idx_rows, + int n_idx_cols, + const value_idx* queryIndptr, + const value_idx* queryIndices, + const value_t* queryData, + size_t queryNNZ, + int n_query_rows, + int n_query_cols, + value_idx* output_indices, + value_t* output_dists, + int k, + const raft::handle_t& handle, + size_t batch_size_index = 2 << 14, // approx 1M + size_t batch_size_query = 2 << 14, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded, + float metricArg = 0) +{ + sparse_knn_t(idxIndptr, + idxIndices, + idxData, + idxNNZ, + n_idx_rows, + n_idx_cols, + queryIndptr, + queryIndices, + queryData, + queryNNZ, + n_query_rows, + n_query_cols, + output_indices, + output_dists, + k, + handle, + batch_size_index, + batch_size_query, + metric, + metricArg) .run(); } diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh index 1cf225087a..1308f5ce02 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.cuh +++ b/cpp/include/raft/sparse/selection/knn_graph.cuh @@ -45,31 +45,34 @@ namespace selection { * @param m */ template -__global__ void fill_indices(value_idx *indices, size_t m, size_t nnz) { +__global__ void fill_indices(value_idx* indices, size_t m, size_t nnz) +{ value_idx tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid >= nnz) return; - value_idx v = tid / m; + value_idx v = tid / m; indices[tid] = v; } template -value_idx build_k(value_idx n_samples, int c) { +value_idx build_k(value_idx n_samples, int c) +{ // from "kNN-MST-Agglomerative: A fast & scalable graph-based data clustering // approach on GPU" - return min(n_samples, - max((value_idx)2, (value_idx)floor(log2(n_samples)) + c)); + return min(n_samples, max((value_idx)2, (value_idx)floor(log2(n_samples)) + c)); } template -__global__ void conv_indices_kernel(in_t *inds, out_t *out, size_t nnz) { +__global__ void conv_indices_kernel(in_t* inds, out_t* out, size_t nnz) +{ size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; - out_t v = inds[tid]; + out_t v = inds[tid]; out[tid] = v; } template -void conv_indices(in_t *inds, out_t *out, size_t size, cudaStream_t stream) { +void conv_indices(in_t* inds, out_t* out, size_t size, cudaStream_t stream) +{ size_t blocks = ceildiv(size, (size_t)tpb); conv_indices_kernel<<>>(inds, out, size); } @@ -91,13 +94,18 @@ void conv_indices(in_t *inds, out_t *out, size_t size, cudaStream_t stream) { * @param c */ template -void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, +void knn_graph(const handle_t& handle, + const value_t* X, + size_t m, + size_t n, distance::DistanceType metric, - raft::sparse::COO &out, int c = 15) { + raft::sparse::COO& out, + int c = 15) +{ int k = build_k(m, c); auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); size_t nnz = m * k; @@ -108,8 +116,8 @@ void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, size_t blocks = ceildiv(nnz, (size_t)256); fill_indices<<>>(rows.data(), k, nnz); - std::vector inputs; - inputs.push_back(const_cast(X)); + std::vector inputs; + inputs.push_back(const_cast(X)); std::vector sizes; sizes.push_back(m); @@ -119,15 +127,25 @@ void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, rmm::device_uvector int64_indices(nnz, stream); uint32_t knn_start = curTimeMillis(); - raft::spatial::knn::brute_force_knn( - handle, inputs, sizes, n, const_cast(X), m, int64_indices.data(), - data.data(), k, true, true, nullptr, metric); + raft::spatial::knn::brute_force_knn(handle, + inputs, + sizes, + n, + const_cast(X), + m, + int64_indices.data(), + data.data(), + k, + true, + true, + nullptr, + metric); // convert from current knn's 64-bit to 32-bit. conv_indices(int64_indices.data(), indices.data(), nnz, stream); - raft::sparse::linalg::symmetrize(handle, rows.data(), indices.data(), - data.data(), m, k, nnz, out); + raft::sparse::linalg::symmetrize( + handle, rows.data(), indices.data(), data.data(), m, k, nnz, out); } }; // namespace selection diff --git a/cpp/include/raft/sparse/selection/selection.cuh b/cpp/include/raft/sparse/selection/selection.cuh index 6066a36289..190e06b2cd 100644 --- a/cpp/include/raft/sparse/selection/selection.cuh +++ b/cpp/include/raft/sparse/selection/selection.cuh @@ -39,27 +39,33 @@ namespace raft { namespace sparse { namespace selection { -template -__global__ void select_k_kernel(K *inK, IndexType *inV, size_t n_rows, - size_t n_cols, K *outK, IndexType *outV, - K initK, IndexType initV, int k) { +template +__global__ void select_k_kernel(K* inK, + IndexType* inV, + size_t n_rows, + size_t n_cols, + K* outK, + IndexType* outV, + K initK, + IndexType initV, + int k) +{ constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ K smemK[kNumWarps * warp_q]; __shared__ IndexType smemV[kNumWarps * warp_q]; - faiss::gpu::BlockSelect, - warp_q, thread_q, tpb> - heap(initK, initV, smemK, smemV, k); + faiss::gpu:: + BlockSelect, warp_q, thread_q, tpb> + heap(initK, initV, smemK, smemV, k); // Grid is exactly sized to rows available int row = blockIdx.x; - int i = threadIdx.x; + int i = threadIdx.x; - int idx = row * n_cols; - K *inKStart = inK + idx + i; - IndexType *inVStart = inV + idx + i; + int idx = row * n_cols; + K* inKStart = inK + idx + i; + IndexType* inVStart = inV + idx + i; // Whole warps must participate in the selection int limit = faiss::gpu::utils::roundDown(n_cols, faiss::gpu::kWarpSize); @@ -86,27 +92,31 @@ __global__ void select_k_kernel(K *inK, IndexType *inV, size_t n_rows, } } -template -inline void select_k_impl(value_t *inK, value_idx *inV, size_t n_rows, - size_t n_cols, value_t *outK, value_idx *outV, - bool select_min, int k, cudaStream_t stream) { +template +inline void select_k_impl(value_t* inK, + value_idx* inV, + size_t n_rows, + size_t n_cols, + value_t* outK, + value_idx* outV, + bool select_min, + int k, + cudaStream_t stream) +{ auto grid = dim3(n_rows); constexpr int n_threads = (warp_q <= 1024) ? 128 : 64; - auto block = dim3(n_threads); + auto block = dim3(n_threads); - auto kInit = select_min ? faiss::gpu::Limits::getMax() - : faiss::gpu::Limits::getMin(); + auto kInit = + select_min ? faiss::gpu::Limits::getMax() : faiss::gpu::Limits::getMin(); auto vInit = -1; if (select_min) { select_k_kernel - <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, - vInit, k); + <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k); } else { select_k_kernel - <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, - vInit, k); + <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k); } CUDA_CHECK(cudaGetLastError()); } @@ -126,30 +136,37 @@ inline void select_k_impl(value_t *inK, value_idx *inV, size_t n_rows, * @param[in] stream CUDA stream to use */ template -inline void select_k(value_t *inK, value_idx *inV, size_t n_rows, size_t n_cols, - value_t *outK, value_idx *outV, bool select_min, int k, - cudaStream_t stream) { +inline void select_k(value_t* inK, + value_idx* inV, + size_t n_rows, + size_t n_cols, + value_t* outK, + value_idx* outV, + bool select_min, + int k, + cudaStream_t stream) +{ if (k == 1) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 32) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 64) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 128) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 256) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 512) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 1024) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); } }; // namespace selection diff --git a/cpp/include/raft/sparse/utils.h b/cpp/include/raft/sparse/utils.h index 63578bf1f3..56e8832e0a 100644 --- a/cpp/include/raft/sparse/utils.h +++ b/cpp/include/raft/sparse/utils.h @@ -26,7 +26,8 @@ namespace sparse { * @param[in] ncols number of blocks to quantize */ template -inline int block_dim(value_idx ncols) { +inline int block_dim(value_idx ncols) +{ int blockdim; if (ncols <= 32) blockdim = 32; @@ -54,9 +55,9 @@ inline int block_dim(value_idx ncols) { * @return */ template -__device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, - G key) { - unsigned int mask = __ballot_sync(init_mask, true); +__device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, G key) +{ + unsigned int mask = __ballot_sync(init_mask, true); unsigned int peer_group = 0; bool is_peer; @@ -77,12 +78,14 @@ __device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, } #endif -__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) { +__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) +{ return __ffs(peer_group) - 1; } template -__global__ void iota_fill_block_kernel(value_idx *indices, value_idx ncols) { +__global__ void iota_fill_block_kernel(value_idx* indices, value_idx ncols) +{ int row = blockIdx.x; int tid = threadIdx.x; @@ -92,15 +95,16 @@ __global__ void iota_fill_block_kernel(value_idx *indices, value_idx ncols) { } template -void iota_fill(value_idx *indices, value_idx nrows, value_idx ncols, - cudaStream_t stream) { +void iota_fill(value_idx* indices, value_idx nrows, value_idx ncols, cudaStream_t stream) +{ int blockdim = block_dim(ncols); iota_fill_block_kernel<<>>(indices, ncols); } template -__device__ int get_stop_idx(T row, T m, T nnz, const T *ind) { +__device__ int get_stop_idx(T row, T m, T nnz, const T* ind) +{ int stop_idx = 0; if (row < (m - 1)) stop_idx = ind[row + 1]; diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp index 77d7831b4a..f77a56164d 100644 --- a/cpp/include/raft/spatial/knn/ann.hpp +++ b/cpp/include/raft/spatial/knn/ann.hpp @@ -45,14 +45,16 @@ using deviceAllocator = raft::mr::device::allocator; * @param[in] D the dimensionality of the index array */ template -inline void approx_knn_build_index(raft::handle_t &handle, - raft::spatial::knn::knnIndex *index, - knnIndexParam *params, +inline void approx_knn_build_index(raft::handle_t& handle, + raft::spatial::knn::knnIndex* index, + knnIndexParam* params, raft::distance::DistanceType metric, - float metricArg, float *index_array, - value_idx n, value_idx D) { - detail::approx_knn_build_index(handle, index, params, metric, metricArg, - index_array, n, D); + float metricArg, + float* index_array, + value_idx n, + value_idx D) +{ + detail::approx_knn_build_index(handle, index, params, metric, metricArg, index_array, n, D); } /** @@ -69,12 +71,15 @@ inline void approx_knn_build_index(raft::handle_t &handle, * @param[in] n number of rows in the query array */ template -inline void approx_knn_search(raft::handle_t &handle, float *distances, - int64_t *indices, - raft::spatial::knn::knnIndex *index, value_idx k, - float *query_array, value_idx n) { - detail::approx_knn_search(handle, distances, indices, index, k, query_array, - n); +inline void approx_knn_search(raft::handle_t& handle, + float* distances, + int64_t* indices, + raft::spatial::knn::knnIndex* index, + value_idx k, + float* query_array, + value_idx n) +{ + detail::approx_knn_search(handle, distances, indices, index, k, query_array, n); } } // namespace knn diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h index 6a6c7751c2..573a23181d 100644 --- a/cpp/include/raft/spatial/knn/ann_common.h +++ b/cpp/include/raft/spatial/knn/ann_common.h @@ -26,13 +26,14 @@ namespace spatial { namespace knn { struct knnIndex { - faiss::gpu::GpuIndex *index; + faiss::gpu::GpuIndex* index; raft::distance::DistanceType metric; float metricArg; - faiss::gpu::StandardGpuResources *gpu_res; + faiss::gpu::StandardGpuResources* gpu_res; int device; - ~knnIndex() { + ~knnIndex() + { delete index; delete gpu_res; } @@ -57,7 +58,8 @@ struct IVFParam : knnIndexParam { int nprobe; }; -struct IVFFlatParam : IVFParam {}; +struct IVFFlatParam : IVFParam { +}; struct IVFPQParam : IVFParam { int M; diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 6e4c99b646..7eb439c78b 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -56,115 +56,107 @@ namespace spatial { namespace knn { namespace detail { -inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype( - QuantizerType qtype) { +inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(QuantizerType qtype) +{ switch (qtype) { - case QuantizerType::QT_8bit: - return faiss::ScalarQuantizer::QuantizerType::QT_8bit; + case QuantizerType::QT_8bit: return faiss::ScalarQuantizer::QuantizerType::QT_8bit; case QuantizerType::QT_8bit_uniform: return faiss::ScalarQuantizer::QuantizerType::QT_8bit_uniform; case QuantizerType::QT_4bit_uniform: return faiss::ScalarQuantizer::QuantizerType::QT_4bit_uniform; - case QuantizerType::QT_fp16: - return faiss::ScalarQuantizer::QuantizerType::QT_fp16; + case QuantizerType::QT_fp16: return faiss::ScalarQuantizer::QuantizerType::QT_fp16; case QuantizerType::QT_8bit_direct: return faiss::ScalarQuantizer::QuantizerType::QT_8bit_direct; - case QuantizerType::QT_6bit: - return faiss::ScalarQuantizer::QuantizerType::QT_6bit; - default: - return (faiss::ScalarQuantizer::QuantizerType)qtype; + case QuantizerType::QT_6bit: return faiss::ScalarQuantizer::QuantizerType::QT_6bit; + default: return (faiss::ScalarQuantizer::QuantizerType)qtype; } } template -void approx_knn_ivfflat_build_index(knnIndex *index, IVFParam *params, - raft::distance::DistanceType metric, - IntType n, IntType D) { +void approx_knn_ivfflat_build_index( + knnIndex* index, IVFParam* params, raft::distance::DistanceType metric, IntType n, IntType D) +{ faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = index->device; + config.device = index->device; faiss::MetricType faiss_metric = build_faiss_metric(metric); - faiss::gpu::GpuIndexIVFFlat *faiss_index = new faiss::gpu::GpuIndexIVFFlat( - index->gpu_res, D, params->nlist, faiss_metric, config); + faiss::gpu::GpuIndexIVFFlat* faiss_index = + new faiss::gpu::GpuIndexIVFFlat(index->gpu_res, D, params->nlist, faiss_metric, config); faiss_index->setNumProbes(params->nprobe); index->index = faiss_index; } template -void approx_knn_ivfpq_build_index(knnIndex *index, IVFPQParam *params, - raft::distance::DistanceType metric, - IntType n, IntType D) { +void approx_knn_ivfpq_build_index( + knnIndex* index, IVFPQParam* params, raft::distance::DistanceType metric, IntType n, IntType D) +{ faiss::gpu::GpuIndexIVFPQConfig config; - config.device = index->device; - config.usePrecomputedTables = params->usePrecomputedTables; - config.interleavedLayout = params->n_bits != 8; - faiss::MetricType faiss_metric = build_faiss_metric(metric); - faiss::gpu::GpuIndexIVFPQ *faiss_index = - new faiss::gpu::GpuIndexIVFPQ(index->gpu_res, D, params->nlist, params->M, - params->n_bits, faiss_metric, config); + config.device = index->device; + config.usePrecomputedTables = params->usePrecomputedTables; + config.interleavedLayout = params->n_bits != 8; + faiss::MetricType faiss_metric = build_faiss_metric(metric); + faiss::gpu::GpuIndexIVFPQ* faiss_index = new faiss::gpu::GpuIndexIVFPQ( + index->gpu_res, D, params->nlist, params->M, params->n_bits, faiss_metric, config); faiss_index->setNumProbes(params->nprobe); index->index = faiss_index; } template -void approx_knn_ivfsq_build_index(knnIndex *index, IVFSQParam *params, - raft::distance::DistanceType metric, - IntType n, IntType D) { +void approx_knn_ivfsq_build_index( + knnIndex* index, IVFSQParam* params, raft::distance::DistanceType metric, IntType n, IntType D) +{ faiss::gpu::GpuIndexIVFScalarQuantizerConfig config; - config.device = index->device; - faiss::MetricType faiss_metric = build_faiss_metric(metric); - faiss::ScalarQuantizer::QuantizerType faiss_qtype = - build_faiss_qtype(params->qtype); - faiss::gpu::GpuIndexIVFScalarQuantizer *faiss_index = - new faiss::gpu::GpuIndexIVFScalarQuantizer(index->gpu_res, D, params->nlist, - faiss_qtype, faiss_metric, - params->encodeResidual); + config.device = index->device; + faiss::MetricType faiss_metric = build_faiss_metric(metric); + faiss::ScalarQuantizer::QuantizerType faiss_qtype = build_faiss_qtype(params->qtype); + faiss::gpu::GpuIndexIVFScalarQuantizer* faiss_index = new faiss::gpu::GpuIndexIVFScalarQuantizer( + index->gpu_res, D, params->nlist, faiss_qtype, faiss_metric, params->encodeResidual); faiss_index->setNumProbes(params->nprobe); index->index = faiss_index; } template -void approx_knn_build_index(raft::handle_t &handle, - raft::spatial::knn::knnIndex *index, - raft::spatial::knn::knnIndexParam *params, +void approx_knn_build_index(raft::handle_t& handle, + raft::spatial::knn::knnIndex* index, + raft::spatial::knn::knnIndexParam* params, raft::distance::DistanceType metric, - float metricArg, float *index_array, IntType n, - IntType D) { + float metricArg, + float* index_array, + IntType n, + IntType D) +{ int device; CUDA_CHECK(cudaGetDevice(&device)); - faiss::gpu::StandardGpuResources *gpu_res = - new faiss::gpu::StandardGpuResources(); + faiss::gpu::StandardGpuResources* gpu_res = new faiss::gpu::StandardGpuResources(); gpu_res->noTempMemory(); gpu_res->setDefaultStream(device, handle.get_stream()); - index->gpu_res = gpu_res; - index->device = device; - index->index = nullptr; - index->metric = metric; + index->gpu_res = gpu_res; + index->device = device; + index->index = nullptr; + index->metric = metric; index->metricArg = metricArg; // perform preprocessing // k set to 0 (unused during preprocessing / revertion) - std::unique_ptr> query_metric_processor = - create_processor(metric, n, D, 0, false, handle.get_stream(), - handle.get_device_allocator()); + std::unique_ptr> query_metric_processor = create_processor( + metric, n, D, 0, false, handle.get_stream(), handle.get_device_allocator()); query_metric_processor->preprocess(index_array); - if (dynamic_cast(params)) { - IVFFlatParam *IVFFlat_param = dynamic_cast(params); + if (dynamic_cast(params)) { + IVFFlatParam* IVFFlat_param = dynamic_cast(params); approx_knn_ivfflat_build_index(index, IVFFlat_param, metric, n, D); std::vector h_index_array(n * D); - raft::update_host(h_index_array.data(), index_array, h_index_array.size(), - handle.get_stream()); + raft::update_host(h_index_array.data(), index_array, h_index_array.size(), handle.get_stream()); query_metric_processor->revert(index_array); index->index->train(n, h_index_array.data()); index->index->add(n, h_index_array.data()); } else { - if (dynamic_cast(params)) { - IVFPQParam *IVFPQ_param = dynamic_cast(params); + if (dynamic_cast(params)) { + IVFPQParam* IVFPQ_param = dynamic_cast(params); approx_knn_ivfpq_build_index(index, IVFPQ_param, metric, n, D); - } else if (dynamic_cast(params)) { - IVFSQParam *IVFSQ_param = dynamic_cast(params); + } else if (dynamic_cast(params)) { + IVFSQParam* IVFSQ_param = dynamic_cast(params); approx_knn_ivfsq_build_index(index, IVFSQ_param, metric, n, D); } else { ASSERT(index->index, "KNN index could not be initialized"); @@ -177,13 +169,23 @@ void approx_knn_build_index(raft::handle_t &handle, } template -void approx_knn_search(raft::handle_t &handle, float *distances, - int64_t *indices, raft::spatial::knn::knnIndex *index, - IntType k, float *query_array, IntType n) { +void approx_knn_search(raft::handle_t& handle, + float* distances, + int64_t* indices, + raft::spatial::knn::knnIndex* index, + IntType k, + float* query_array, + IntType n) +{ // perform preprocessing std::unique_ptr> query_metric_processor = - create_processor(index->metric, n, index->index->d, k, false, - handle.get_stream(), handle.get_device_allocator()); + create_processor(index->metric, + n, + index->index->d, + k, + false, + handle.get_stream(), + handle.get_device_allocator()); query_metric_processor->preprocess(query_array); index->index->search(n, query_array, k, distances, indices); @@ -194,13 +196,14 @@ void approx_knn_search(raft::handle_t &handle, float *distances, index->metric == raft::distance::DistanceType::L2SqrtUnexpanded || index->metric == raft::distance::DistanceType::LpUnexpanded) { /** - * post-processing - */ + * post-processing + */ float p = 0.5; // standard l2 - if (index->metric == raft::distance::DistanceType::LpUnexpanded) - p = 1.0 / index->metricArg; + if (index->metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / index->metricArg; raft::linalg::unaryOp( - distances, distances, n * k, + distances, + distances, + n * k, [p] __device__(float input) { return powf(input, p); }, handle.get_stream()); } diff --git a/cpp/include/raft/spatial/knn/detail/common_faiss.h b/cpp/include/raft/spatial/knn/detail/common_faiss.h index 0c0398a336..5618186dfc 100644 --- a/cpp/include/raft/spatial/knn/detail/common_faiss.h +++ b/cpp/include/raft/spatial/knn/detail/common_faiss.h @@ -27,37 +27,26 @@ namespace spatial { namespace knn { namespace detail { -inline faiss::MetricType build_faiss_metric( - raft::distance::DistanceType metric) { +inline faiss::MetricType build_faiss_metric(raft::distance::DistanceType metric) +{ switch (metric) { case raft::distance::DistanceType::CosineExpanded: return faiss::MetricType::METRIC_INNER_PRODUCT; case raft::distance::DistanceType::CorrelationExpanded: return faiss::MetricType::METRIC_INNER_PRODUCT; - case raft::distance::DistanceType::L2Expanded: - return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L2Unexpanded: - return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L2SqrtExpanded: - return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L2SqrtUnexpanded: - return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L1: - return faiss::MetricType::METRIC_L1; - case raft::distance::DistanceType::InnerProduct: - return faiss::MetricType::METRIC_INNER_PRODUCT; - case raft::distance::DistanceType::LpUnexpanded: - return faiss::MetricType::METRIC_Lp; - case raft::distance::DistanceType::Linf: - return faiss::MetricType::METRIC_Linf; - case raft::distance::DistanceType::Canberra: - return faiss::MetricType::METRIC_Canberra; - case raft::distance::DistanceType::BrayCurtis: - return faiss::MetricType::METRIC_BrayCurtis; + case raft::distance::DistanceType::L2Expanded: return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L2Unexpanded: return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L2SqrtExpanded: return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L2SqrtUnexpanded: return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L1: return faiss::MetricType::METRIC_L1; + case raft::distance::DistanceType::InnerProduct: return faiss::MetricType::METRIC_INNER_PRODUCT; + case raft::distance::DistanceType::LpUnexpanded: return faiss::MetricType::METRIC_Lp; + case raft::distance::DistanceType::Linf: return faiss::MetricType::METRIC_Linf; + case raft::distance::DistanceType::Canberra: return faiss::MetricType::METRIC_Canberra; + case raft::distance::DistanceType::BrayCurtis: return faiss::MetricType::METRIC_BrayCurtis; case raft::distance::DistanceType::JensenShannon: return faiss::MetricType::METRIC_JensenShannon; - default: - THROW("MetricType not supported: %d", metric); + default: THROW("MetricType not supported: %d", metric); } } diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh index 7d87254cb6..049c11514c 100644 --- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh +++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh @@ -35,7 +35,8 @@ namespace knn { namespace detail { template -DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) { +DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) +{ value_t sin_0 = sin(0.5 * (x1 - y1)); value_t sin_1 = sin(0.5 * (x2 - y2)); value_t rdist = sin_0 * sin_0 + cos(x1) * cos(y1) * sin_1 * sin_1; @@ -56,34 +57,36 @@ DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) { * @param[in] n_index_rows number of rows in index array * @param[in] k number of closest neighbors to return */ -template -__global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists, - const value_t *index, const value_t *query, - size_t n_index_rows, int k) { +template +__global__ void haversine_knn_kernel(value_idx* out_inds, + value_t* out_dists, + const value_t* index, + const value_t* query, + size_t n_index_rows, + int k) +{ constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ value_t smemK[kNumWarps * warp_q]; __shared__ value_idx smemV[kNumWarps * warp_q]; - faiss::gpu::BlockSelect, warp_q, thread_q, - tpb> - heap(faiss::gpu::Limits::getMax(), -1, smemK, smemV, k); + faiss::gpu:: + BlockSelect, warp_q, thread_q, tpb> + heap(faiss::gpu::Limits::getMax(), -1, smemK, smemV, k); // Grid is exactly sized to rows available int limit = faiss::gpu::utils::roundDown(n_index_rows, faiss::gpu::kWarpSize); - const value_t *query_ptr = query + (blockIdx.x * 2); - value_t x1 = query_ptr[0]; - value_t x2 = query_ptr[1]; + const value_t* query_ptr = query + (blockIdx.x * 2); + value_t x1 = query_ptr[0]; + value_t x2 = query_ptr[1]; int i = threadIdx.x; for (; i < limit; i += tpb) { - const value_t *idx_ptr = index + (i * 2); - value_t y1 = idx_ptr[0]; - value_t y2 = idx_ptr[1]; + const value_t* idx_ptr = index + (i * 2); + value_t y1 = idx_ptr[0]; + value_t y2 = idx_ptr[1]; value_t dist = compute_haversine(x1, y1, x2, y2); @@ -92,9 +95,9 @@ __global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists, // Handle last remainder fraction of a warp of elements if (i < n_index_rows) { - const value_t *idx_ptr = index + (i * 2); - value_t y1 = idx_ptr[0]; - value_t y2 = idx_ptr[1]; + const value_t* idx_ptr = index + (i * 2); + value_t y1 = idx_ptr[0]; + value_t y2 = idx_ptr[1]; value_t dist = compute_haversine(x1, y1, x2, y2); @@ -105,7 +108,7 @@ __global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists, for (int i = threadIdx.x; i < k; i += tpb) { out_dists[blockIdx.x * k + i] = smemK[i]; - out_inds[blockIdx.x * k + i] = smemV[i]; + out_inds[blockIdx.x * k + i] = smemV[i]; } } @@ -126,10 +129,15 @@ __global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists, * @param[in] stream stream to order kernel launch */ template -void haversine_knn(value_idx *out_inds, value_t *out_dists, - const value_t *index, const value_t *query, - size_t n_index_rows, size_t n_query_rows, int k, - cudaStream_t stream) { +void haversine_knn(value_idx* out_inds, + value_t* out_dists, + const value_t* index, + const value_t* query, + size_t n_index_rows, + size_t n_query_rows, + int k, + cudaStream_t stream) +{ haversine_knn_kernel<<>>( out_inds, out_dists, index, query, n_index_rows, k); } diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index 09494e9eb1..a276ae45ad 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -43,13 +43,18 @@ namespace spatial { namespace knn { namespace detail { -template -__global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV, - value_t *outK, value_idx *outV, - size_t n_samples, int n_parts, - value_t initK, value_idx initV, int k, - value_idx *translations) { +template +__global__ void knn_merge_parts_kernel(value_t* inK, + value_idx* inV, + value_t* outK, + value_idx* outV, + size_t n_samples, + int n_parts, + value_t initK, + value_idx initV, + int k, + value_idx* translations) +{ constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ value_t smemK[kNumWarps * warp_q]; @@ -58,34 +63,33 @@ __global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV, /** * Uses shared memory */ - faiss::gpu::BlockSelect, warp_q, thread_q, - tpb> - heap(initK, initV, smemK, smemV, k); + faiss::gpu:: + BlockSelect, warp_q, thread_q, tpb> + heap(initK, initV, smemK, smemV, k); // Grid is exactly sized to rows available - int row = blockIdx.x; + int row = blockIdx.x; int total_k = k * n_parts; int i = threadIdx.x; // Get starting pointers for cols in current thread - int part = i / k; + int part = i / k; size_t row_idx = (row * k) + (part * n_samples * k); int col = i % k; - value_t *inKStart = inK + (row_idx + col); - value_idx *inVStart = inV + (row_idx + col); + value_t* inKStart = inK + (row_idx + col); + value_idx* inVStart = inV + (row_idx + col); - int limit = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize); + int limit = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize); value_idx translation = 0; for (; i < limit; i += tpb) { translation = translations[part]; heap.add(*inKStart, (*inVStart) + translation); - part = (i + tpb) / k; + part = (i + tpb) / k; row_idx = (row * k) + (part * n_samples * k); col = (i + tpb) % k; @@ -108,22 +112,27 @@ __global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV, } } -template -inline void knn_merge_parts_impl(value_t *inK, value_idx *inV, value_t *outK, - value_idx *outV, size_t n_samples, int n_parts, - int k, cudaStream_t stream, - value_idx *translations) { +template +inline void knn_merge_parts_impl(value_t* inK, + value_idx* inV, + value_t* outK, + value_idx* outV, + size_t n_samples, + int n_parts, + int k, + cudaStream_t stream, + value_idx* translations) +{ auto grid = dim3(n_samples); constexpr int n_threads = (warp_q <= 1024) ? 128 : 64; - auto block = dim3(n_threads); + auto block = dim3(n_threads); auto kInit = faiss::gpu::Limits::getMax(); auto vInit = -1; knn_merge_parts_kernel - <<>>(inK, inV, outK, outV, n_samples, n_parts, - kInit, vInit, k, translations); + <<>>( + inK, inV, outK, outV, n_samples, n_parts, kInit, vInit, k, translations); CUDA_CHECK(cudaPeekAtLastError()); } @@ -142,10 +151,16 @@ inline void knn_merge_parts_impl(value_t *inK, value_idx *inV, value_t *outK, * @param translations mapping of index offsets for each partition */ template -inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, - value_idx *outV, size_t n_samples, int n_parts, - int k, cudaStream_t stream, - value_idx *translations) { +inline void knn_merge_parts(value_t* inK, + value_idx* inV, + value_t* outK, + value_idx* outV, + size_t n_samples, + int n_parts, + int k, + cudaStream_t stream, + value_idx* translations) +{ if (k == 1) knn_merge_parts_impl( inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); @@ -195,27 +210,33 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, * @param[in] metricArg metric argument to use. Corresponds to the p arg for lp norm */ template -void brute_force_knn_impl(std::vector &input, std::vector &sizes, - IntType D, float *search_items, IntType n, - int64_t *res_I, float *res_D, IntType k, - std::shared_ptr allocator, - cudaStream_t userStream, - cudaStream_t *internalStreams = nullptr, - int n_int_streams = 0, bool rowMajorIndex = true, - bool rowMajorQuery = true, - std::vector *translations = nullptr, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2Expanded, - float metricArg = 0) { - ASSERT(input.size() == sizes.size(), - "input and sizes vectors should be the same size"); - - std::vector *id_ranges; +void brute_force_knn_impl( + std::vector& input, + std::vector& sizes, + IntType D, + float* search_items, + IntType n, + int64_t* res_I, + float* res_D, + IntType k, + std::shared_ptr allocator, + cudaStream_t userStream, + cudaStream_t* internalStreams = nullptr, + int n_int_streams = 0, + bool rowMajorIndex = true, + bool rowMajorQuery = true, + std::vector* translations = nullptr, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded, + float metricArg = 0) +{ + ASSERT(input.size() == sizes.size(), "input and sizes vectors should be the same size"); + + std::vector* id_ranges; if (translations == nullptr) { // If we don't have explicit translations // for offsets of the indices, build them // from the local partitions - id_ranges = new std::vector(); + id_ranges = new std::vector(); int64_t total_n = 0; for (size_t i = 0; i < input.size(); i++) { id_ranges->push_back(total_n); @@ -228,31 +249,27 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, // perform preprocessing std::unique_ptr> query_metric_processor = - create_processor(metric, n, D, k, rowMajorQuery, userStream, - allocator); + create_processor(metric, n, D, k, rowMajorQuery, userStream, allocator); query_metric_processor->preprocess(search_items); - std::vector>> metric_processors( - input.size()); + std::vector>> metric_processors(input.size()); for (size_t i = 0; i < input.size(); i++) { - metric_processors[i] = create_processor( - metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); + metric_processors[i] = + create_processor(metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); metric_processors[i]->preprocess(input[i]); } int device; CUDA_CHECK(cudaGetDevice(&device)); - raft::mr::device::buffer trans(allocator, userStream, - id_ranges->size()); - raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), - userStream); + raft::mr::device::buffer trans(allocator, userStream, id_ranges->size()); + raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream); raft::mr::device::buffer all_D(allocator, userStream, 0); raft::mr::device::buffer all_I(allocator, userStream, 0); - float *out_D = res_D; - int64_t *out_I = res_I; + float* out_D = res_D; + int64_t* out_I = res_I; if (input.size() > 1) { all_D.resize(input.size() * k * n, userStream); @@ -266,11 +283,10 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream)); for (size_t i = 0; i < input.size(); i++) { - float *out_d_ptr = out_D + (i * k * n); - int64_t *out_i_ptr = out_I + (i * k * n); + float* out_d_ptr = out_D + (i * k * n); + int64_t* out_i_ptr = out_I + (i * k * n); - cudaStream_t stream = - raft::select_stream(userStream, internalStreams, n_int_streams, i); + cudaStream_t stream = raft::select_stream(userStream, internalStreams, n_int_streams, i); switch (metric) { case raft::distance::DistanceType::Haversine: @@ -279,8 +295,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, "Haversine distance requires 2 dimensions " "(latitude / longitude)."); - haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, - k, stream); + haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, k, stream); break; default: faiss::MetricType m = build_faiss_metric(metric); @@ -291,18 +306,18 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, gpu_res.setDefaultStream(device, stream); faiss::gpu::GpuDistanceParams args; - args.metric = m; - args.metricArg = metricArg; - args.k = k; - args.dims = D; - args.vectors = input[i]; + args.metric = m; + args.metricArg = metricArg; + args.k = k; + args.dims = D; + args.vectors = input[i]; args.vectorsRowMajor = rowMajorIndex; - args.numVectors = sizes[i]; - args.queries = search_items; + args.numVectors = sizes[i]; + args.queries = search_items; args.queriesRowMajor = rowMajorQuery; - args.numQueries = n; - args.outDistances = out_d_ptr; - args.outIndices = out_i_ptr; + args.numQueries = n; + args.outDistances = out_d_ptr; + args.outIndices = out_i_ptr; /** * @todo: Until FAISS supports pluggable allocation strategies, @@ -325,8 +340,7 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, if (input.size() > 1 || translations != nullptr) { // This is necessary for proper index translations. If there are // no translations or partitions to combine, it can be skipped. - knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, - trans.data()); + knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, trans.data()); } // Perform necessary post-processing @@ -334,14 +348,12 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, metric == raft::distance::DistanceType::L2SqrtUnexpanded || metric == raft::distance::DistanceType::LpUnexpanded) { /** - * post-processing - */ + * post-processing + */ float p = 0.5; // standard l2 - if (metric == raft::distance::DistanceType::LpUnexpanded) - p = 1.0 / metricArg; + if (metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / metricArg; raft::linalg::unaryOp( - res_D, res_D, n * k, - [p] __device__(float input) { return powf(input, p); }, userStream); + res_D, res_D, n * k, [p] __device__(float input) { return powf(input, p); }, userStream); } query_metric_processor->revert(search_items); diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index a645412c2f..6e983d1f42 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -39,11 +39,11 @@ using deviceAllocator = raft::mr::device::allocator; template class MetricProcessor { public: - virtual void preprocess(math_t *data) {} + virtual void preprocess(math_t* data) {} - virtual void revert(math_t *data) {} + virtual void revert(math_t* data) {} - virtual void postprocess(math_t *data) {} + virtual void postprocess(math_t* data) {} virtual ~MetricProcessor() = default; }; @@ -60,7 +60,10 @@ class CosineMetricProcessor : public MetricProcessor { raft::mr::device::buffer colsums_; public: - CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, + CosineMetricProcessor(size_t n_rows, + size_t n_cols, + int k, + bool row_major, cudaStream_t stream, std::shared_ptr allocator) : device_allocator_(allocator), @@ -69,30 +72,51 @@ class CosineMetricProcessor : public MetricProcessor { n_cols_(n_cols), n_rows_(n_rows), row_major_(row_major), - k_(k) {} + k_(k) + { + } - void preprocess(math_t *data) { - raft::linalg::rowNorm(colsums_.data(), data, n_cols_, n_rows_, - raft::linalg::NormType::L2Norm, row_major_, stream_, + void preprocess(math_t* data) + { + raft::linalg::rowNorm(colsums_.data(), + data, + n_cols_, + n_rows_, + raft::linalg::NormType::L2Norm, + row_major_, + stream_, [] __device__(math_t in) { return sqrtf(in); }); raft::linalg::matrixVectorOp( - data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false, + data, + data, + colsums_.data(), + n_cols_, + n_rows_, + row_major_, + false, [] __device__(math_t mat_in, math_t vec_in) { return mat_in / vec_in; }, stream_); } - void revert(math_t *data) { + void revert(math_t* data) + { raft::linalg::matrixVectorOp( - data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false, + data, + data, + colsums_.data(), + n_cols_, + n_rows_, + row_major_, + false, [] __device__(math_t mat_in, math_t vec_in) { return mat_in * vec_in; }, stream_); } - void postprocess(math_t *data) { + void postprocess(math_t* data) + { raft::linalg::unaryOp( - data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, - stream_); + data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, stream_); } ~CosineMetricProcessor() = default; @@ -103,43 +127,64 @@ class CorrelationMetricProcessor : public CosineMetricProcessor { using cosine = CosineMetricProcessor; public: - CorrelationMetricProcessor(size_t n_rows, size_t n_cols, int k, - bool row_major, cudaStream_t stream, + CorrelationMetricProcessor(size_t n_rows, + size_t n_cols, + int k, + bool row_major, + cudaStream_t stream, std::shared_ptr allocator) - : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream, - allocator), - means_(allocator, stream, n_rows) {} + : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream, allocator), + means_(allocator, stream, n_rows) + { + } - void preprocess(math_t *data) { + void preprocess(math_t* data) + { math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_; - raft::linalg::reduce(means_.data(), data, cosine::n_cols_, cosine::n_rows_, - (math_t)0.0, cosine::row_major_, true, + raft::linalg::reduce(means_.data(), + data, + cosine::n_cols_, + cosine::n_rows_, + (math_t)0.0, + cosine::row_major_, + true, cosine::stream_); raft::linalg::unaryOp( - means_.data(), means_.data(), cosine::n_rows_, + means_.data(), + means_.data(), + cosine::n_rows_, [=] __device__(math_t in) { return in * normalizer_const; }, cosine::stream_); - raft::stats::meanCenter(data, data, means_.data(), cosine::n_cols_, - cosine::n_rows_, cosine::row_major_, false, + raft::stats::meanCenter(data, + data, + means_.data(), + cosine::n_cols_, + cosine::n_rows_, + cosine::row_major_, + false, cosine::stream_); CosineMetricProcessor::preprocess(data); } - void revert(math_t *data) { + void revert(math_t* data) + { CosineMetricProcessor::revert(data); - raft::stats::meanAdd(data, data, means_.data(), cosine::n_cols_, - cosine::n_rows_, cosine::row_major_, false, + raft::stats::meanAdd(data, + data, + means_.data(), + cosine::n_cols_, + cosine::n_rows_, + cosine::row_major_, + false, cosine::stream_); } - void postprocess(math_t *data) { - CosineMetricProcessor::postprocess(data); - } + void postprocess(math_t* data) { CosineMetricProcessor::postprocess(data); } ~CorrelationMetricProcessor() = default; @@ -149,33 +194,36 @@ class CorrelationMetricProcessor : public CosineMetricProcessor { template class DefaultMetricProcessor : public MetricProcessor { public: - void preprocess(math_t *data) {} + void preprocess(math_t* data) {} - void revert(math_t *data) {} + void revert(math_t* data) {} - void postprocess(math_t *data) {} + void postprocess(math_t* data) {} ~DefaultMetricProcessor() = default; }; template inline std::unique_ptr> create_processor( - distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery, - cudaStream_t userStream, std::shared_ptr allocator) { - MetricProcessor *mp = nullptr; + distance::DistanceType metric, + int n, + int D, + int k, + bool rowMajorQuery, + cudaStream_t userStream, + std::shared_ptr allocator) +{ + MetricProcessor* mp = nullptr; switch (metric) { case distance::DistanceType::CosineExpanded: - mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream, - allocator); + mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream, allocator); break; case distance::DistanceType::CorrelationExpanded: - mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, - userStream, allocator); + mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, userStream, allocator); break; - default: - mp = new DefaultMetricProcessor(); + default: mp = new DefaultMetricProcessor(); } return std::unique_ptr>(mp); diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index a3a1972c13..42ee11ba5b 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -28,12 +28,17 @@ namespace knn { using deviceAllocator = raft::mr::device::allocator; template -inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, - value_idx *outV, size_t n_samples, int n_parts, - int k, cudaStream_t stream, - value_idx *translations) { - detail::knn_merge_parts(inK, inV, outK, outV, n_samples, n_parts, k, stream, - translations); +inline void knn_merge_parts(value_t* inK, + value_idx* inV, + value_t* outK, + value_idx* outV, + size_t n_samples, + int n_parts, + int k, + cudaStream_t stream, + value_idx* translations) +{ + detail::knn_merge_parts(inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); } /** @@ -59,23 +64,42 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, * @param[in] expanded should lp-based distances be returned in their expanded * form (e.g., without raising to the 1/p power). */ -inline void brute_force_knn( - raft::handle_t const &handle, std::vector &input, - std::vector &sizes, int D, float *search_items, int n, int64_t *res_I, - float *res_D, int k, bool rowMajorIndex = true, bool rowMajorQuery = true, - std::vector *translations = nullptr, - distance::DistanceType metric = distance::DistanceType::L2Unexpanded, - float metric_arg = 2.0f) { - ASSERT(input.size() == sizes.size(), - "input and sizes vectors must be the same size"); +inline void brute_force_knn(raft::handle_t const& handle, + std::vector& input, + std::vector& sizes, + int D, + float* search_items, + int n, + int64_t* res_I, + float* res_D, + int k, + bool rowMajorIndex = true, + bool rowMajorQuery = true, + std::vector* translations = nullptr, + distance::DistanceType metric = distance::DistanceType::L2Unexpanded, + float metric_arg = 2.0f) +{ + ASSERT(input.size() == sizes.size(), "input and sizes vectors must be the same size"); std::vector int_streams = handle.get_internal_streams(); - detail::brute_force_knn_impl(input, sizes, D, search_items, n, res_I, res_D, - k, handle.get_device_allocator(), - handle.get_stream(), int_streams.data(), - handle.get_num_internal_streams(), rowMajorIndex, - rowMajorQuery, translations, metric, metric_arg); + detail::brute_force_knn_impl(input, + sizes, + D, + search_items, + n, + res_I, + res_D, + k, + handle.get_device_allocator(), + handle.get_stream(), + int_streams.data(), + handle.get_num_internal_streams(), + rowMajorIndex, + rowMajorQuery, + translations, + metric, + metric_arg); } } // namespace knn diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index 922ae7cfab..7032a0009e 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -24,8 +24,7 @@ using namespace matrix; // aggregate of control params for Eigen Solver: // -template +template struct cluster_solver_config_t { size_type_t n_clusters; size_type_t maxIter; @@ -35,25 +34,37 @@ struct cluster_solver_config_t { unsigned long long seed{123456}; }; -template +template struct kmeans_solver_t { - explicit kmeans_solver_t(cluster_solver_config_t const& config) - : config_(config) {} + explicit kmeans_solver_t( + cluster_solver_config_t const& config) + : config_(config) + { + } template - std::pair solve( - handle_t const& handle, thrust_exe_policy_t t_exe_policy, - size_type_t n_obs_vecs, size_type_t dim, - value_type_t const* __restrict__ obs, - index_type_t* __restrict__ codes) const { + std::pair solve(handle_t const& handle, + thrust_exe_policy_t t_exe_policy, + size_type_t n_obs_vecs, + size_type_t dim, + value_type_t const* __restrict__ obs, + index_type_t* __restrict__ codes) const + { RAFT_EXPECTS(obs != nullptr, "Null obs buffer."); RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); value_type_t residual{}; index_type_t iters{}; - kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters, - config_.tol, config_.maxIter, obs, codes, residual, iters, + kmeans(handle, + t_exe_policy, + n_obs_vecs, + dim, + config_.n_clusters, + config_.tol, + config_.maxIter, + obs, + codes, + residual, + iters, config_.seed); return std::make_pair(residual, iters); } diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp index e36dca2e0c..156b996586 100644 --- a/cpp/include/raft/spectral/eigen_solvers.hpp +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -23,8 +23,7 @@ using namespace matrix; // aggregate of control params for Eigen Solver: // -template +template struct eigen_solver_config_t { size_type_t n_eigVecs; size_type_t maxIter; @@ -34,42 +33,59 @@ struct eigen_solver_config_t { bool reorthogonalize{false}; unsigned long long seed{ - 1234567}; // CAVEAT: this default value is now common to all instances of using seed in Lanczos; was not the case before: there were places where a default seed = 123456 was used; this may trigger slightly different # solver iterations + 1234567}; // CAVEAT: this default value is now common to all instances of using seed in + // Lanczos; was not the case before: there were places where a default seed = 123456 + // was used; this may trigger slightly different # solver iterations }; -template +template struct lanczos_solver_t { - explicit lanczos_solver_t(eigen_solver_config_t const& config) - : config_(config) {} + explicit lanczos_solver_t( + eigen_solver_config_t const& config) + : config_(config) + { + } - index_type_t solve_smallest_eigenvectors( - handle_t const& handle, - sparse_matrix_t const& A, - value_type_t* __restrict__ eigVals, - value_type_t* __restrict__ eigVecs) const { + index_type_t solve_smallest_eigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const + { RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; - computeSmallestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, - config_.restartIter, config_.tol, - config_.reorthogonalize, iters, eigVals, - eigVecs, config_.seed); + computeSmallestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, + config_.seed); return iters; } - index_type_t solve_largest_eigenvectors( - handle_t const& handle, - sparse_matrix_t const& A, - value_type_t* __restrict__ eigVals, - value_type_t* __restrict__ eigVecs) const { + index_type_t solve_largest_eigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const + { RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; - computeLargestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, - config_.restartIter, config_.tol, - config_.reorthogonalize, iters, eigVals, eigVecs, + computeLargestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, config_.seed); return iters; } diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index fb05bff3e2..e0c3565b77 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -44,15 +44,15 @@ using namespace raft::linalg; // Useful grid settings // ========================================================= -constexpr unsigned int BLOCK_SIZE = 1024; -constexpr unsigned int WARP_SIZE = 32; +constexpr unsigned int BLOCK_SIZE = 1024; +constexpr unsigned int WARP_SIZE = 32; constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); // ========================================================= // CUDA kernels // ========================================================= -/** +/** * @brief Compute distances between observation vectors and centroids * Block dimensions should be (warpSize, 1, * blockSize/warpSize). Ideally, the grid is large enough so there @@ -76,11 +76,13 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); * initialized to zero. */ template -static __global__ void computeDistances( - index_type_t n, index_type_t d, index_type_t k, - const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, - value_type_t* __restrict__ dists) { +static __global__ void computeDistances(index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists) +{ // Loop index index_type_t i; @@ -115,12 +117,10 @@ static __global__ void computeDistances( // Perform reduction on warp for (i = WARP_SIZE / 2; i > 0; i /= 2) - dist_private += - __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i); + dist_private += __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i); // Write result to global memory - if (threadIdx.x == 0) - atomicAdd(dists + IDX(gidz, gidy, n), dist_private); + if (threadIdx.x == 0) atomicAdd(dists + IDX(gidz, gidy, n), dist_private); // Move to another observation vector gidz += blockDim.z * gridDim.z; @@ -135,8 +135,8 @@ static __global__ void computeDistances( } } -/** - * @brief Find closest centroid to observation vectors. +/** + * @brief Find closest centroid to observation vectors. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. * @tparam index_type_t the type of data used for indexing. @@ -157,10 +157,12 @@ static __global__ void computeDistances( * cluster. Entries must be initialized to zero. */ template -static __global__ void minDistances(index_type_t n, index_type_t k, +static __global__ void minDistances(index_type_t n, + index_type_t k, value_type_t* __restrict__ dists, index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) { + index_type_t* __restrict__ clusterSizes) +{ // Loop index index_type_t i, j; @@ -179,8 +181,8 @@ static __global__ void minDistances(index_type_t n, index_type_t k, dist_min = dists[IDX(i, 0, n)]; for (j = 1; j < k; ++j) { dist_curr = dists[IDX(i, j, n)]; - code_min = (dist_curr < dist_min) ? j : code_min; - dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; + code_min = (dist_curr < dist_min) ? j : code_min; + dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; } // Transfer result to global memory @@ -195,8 +197,8 @@ static __global__ void minDistances(index_type_t n, index_type_t k, } } -/** - * @brief Check if newly computed distances are smaller than old distances. +/** + * @brief Check if newly computed distances are smaller than old distances. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. * @tparam index_type_t the type of data used for indexing. @@ -219,7 +221,8 @@ static __global__ void minDistances2(index_type_t n, value_type_t* __restrict__ dists_old, const value_type_t* __restrict__ dists_new, index_type_t* __restrict__ codes_old, - index_type_t code_new) { + index_type_t code_new) +{ // Loop index index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; @@ -244,7 +247,7 @@ static __global__ void minDistances2(index_type_t n, } } -/** +/** * @brief Compute size of k-means clusters. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. @@ -256,9 +259,11 @@ static __global__ void minDistances2(index_type_t n, * cluster. Entries must be initialized to zero. */ template -static __global__ void computeClusterSizes( - index_type_t n, index_type_t k, const index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) { +static __global__ void computeClusterSizes(index_type_t n, + index_type_t k, + const index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) +{ index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { atomicAdd(clusterSizes + codes[i], 1); @@ -266,8 +271,8 @@ static __global__ void computeClusterSizes( } } -/** - * @brief Divide rows of centroid matrix by cluster sizes. +/** + * @brief Divide rows of centroid matrix by cluster sizes. * Divides the ith column of the sum matrix by the size of the ith * cluster. If the sum matrix has been initialized so that the ith * row is the sum of all observation vectors in the ith cluster, @@ -288,9 +293,11 @@ static __global__ void computeClusterSizes( * column is the mean position of a cluster). */ template -static __global__ void divideCentroids( - index_type_t d, index_type_t k, const index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ centroids) { +static __global__ void divideCentroids(index_type_t d, + index_type_t k, + const index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids) +{ // Global indices index_type_t gidx, gidy; @@ -341,15 +348,17 @@ static __global__ void divideCentroids( * coordinates. * @return Zero if successful. Otherwise non-zero. */ -template +template static int chooseNewCentroid(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, + index_type_t n, + index_type_t d, + index_type_t k, value_type_t rand, const value_type_t* __restrict__ obs, value_type_t* __restrict__ dists, - value_type_t* __restrict__ centroid) { + value_type_t* __restrict__ centroid) +{ // Cumulative sum of distances value_type_t* distsCumSum = dists + n; // Residual sum of squares @@ -358,43 +367,43 @@ static int chooseNewCentroid(handle_t const& handle, index_type_t obsIndex; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Compute cumulative sum of distances - thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists), + thrust::inclusive_scan(thrust_exec_policy, + thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), thrust::device_pointer_cast(distsCumSum)); CHECK_CUDA(stream); - CUDA_TRY(cudaMemcpyAsync(&distsSum, distsCumSum + n - 1, sizeof(value_type_t), - cudaMemcpyDeviceToHost, stream)); + CUDA_TRY(cudaMemcpyAsync( + &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); // Randomly choose observation vector // Probabilities are proportional to square of distance to closest // centroid (see k-means++ algorithm) // - //seg-faults due to Thrust bug - //on binary-search-like algorithms - //when run with stream dependent - //execution policies; fixed on Thrust GitHub - //hence replace w/ linear interpolation, - //until the Thrust issue gets resolved: + // seg-faults due to Thrust bug + // on binary-search-like algorithms + // when run with stream dependent + // execution policies; fixed on Thrust GitHub + // hence replace w/ linear interpolation, + // until the Thrust issue gets resolved: // // obsIndex = (thrust::lower_bound( // thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), // thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - // thrust::device_pointer_cast(distsCumSum)); // - //linear interpolation logic: + // linear interpolation logic: //{ value_type_t minSum{0}; - CUDA_TRY(cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), - cudaMemcpyDeviceToHost, stream)); + CUDA_TRY( + cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); CHECK_CUDA(stream); if (distsSum > minSum) { value_type_t vIndex = static_cast(n - 1); - obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / - (distsSum - minSum)); + obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / (distsSum - minSum)); } else { obsIndex = 0; } @@ -405,21 +414,23 @@ static int chooseNewCentroid(handle_t const& handle, obsIndex = min(obsIndex, n - 1); // Record new centroid position - CUDA_TRY(cudaMemcpyAsync(centroid, obs + IDX(0, obsIndex, d), - d * sizeof(value_type_t), cudaMemcpyDeviceToDevice, + CUDA_TRY(cudaMemcpyAsync(centroid, + obs + IDX(0, obsIndex, d), + d * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, stream)); return 0; } /** - * @brief Choose initial cluster centroids for k-means algorithm. + * @brief Choose initial cluster centroids for k-means algorithm. * Centroids are randomly chosen with k-means++ algorithm * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy + * @param thrust_exec_policy thrust execution policy * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. @@ -439,14 +450,19 @@ static int chooseNewCentroid(handle_t const& handle, * distance between observation vectors and the closest centroid. * @return Zero if successful. Otherwise non-zero. */ -template -static int initializeCentroids( - handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, - index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, - value_type_t* __restrict__ centroids, index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ dists, - unsigned long long seed) { +template +static int initializeCentroids(handle_t const& handle, + thrust_exe_pol_t thrust_exec_policy, + index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + value_type_t* __restrict__ centroids, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ dists, + unsigned long long seed) +{ // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -459,7 +475,7 @@ static int initializeCentroids( thrust::uniform_real_distribution uniformDist(0, 1); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); constexpr index_type_t grid_lower_bound{65535}; @@ -471,36 +487,43 @@ static int initializeCentroids( dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE}; // CUDA grid dimensions - dim3 gridDim_warp{ - min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1, - min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; + dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + 1, + min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; // CUDA grid dimensions - dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), - 1, 1}; + dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1}; // Assign observation vectors to code 0 CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); // Choose first centroid - thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), - thrust::device_pointer_cast(dists + n), 1); + thrust::fill(thrust_exec_policy, + thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n), + 1); CHECK_CUDA(stream); - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), - obs, dists, centroids)) + if (chooseNewCentroid( + handle, thrust_exec_policy, n, d, k, uniformDist(rng), obs, dists, centroids)) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream)); - computeDistances<<>>( - n, d, 1, obs, centroids, dists); + computeDistances<<>>(n, d, 1, obs, centroids, dists); CHECK_CUDA(stream); // Choose remaining centroids for (i = 1; i < k; ++i) { // Choose ith centroid - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), - obs, dists, centroids + IDX(0, i, d))) + if (chooseNewCentroid(handle, + thrust_exec_policy, + n, + d, + k, + uniformDist(rng), + obs, + dists, + centroids + IDX(0, i, d))) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid @@ -510,22 +533,20 @@ static int initializeCentroids( CHECK_CUDA(stream); // Recompute minimum distances - minDistances2<<>>(n, dists, dists + n, - codes, i); + minDistances2<<>>(n, dists, dists + n, codes, i); CHECK_CUDA(stream); } // Compute cluster sizes CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); - computeClusterSizes<<>>(n, k, codes, - clusterSizes); + computeClusterSizes<<>>(n, k, codes, clusterSizes); CHECK_CUDA(stream); return 0; } -/** - * @brief Find cluster centroids closest to observation vectors. +/** + * @brief Find cluster centroids closest to observation vectors. * Distance is measured with Euclidean norm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. @@ -553,16 +574,21 @@ static int initializeCentroids( * of squares of assignment. * @return Zero if successful. Otherwise non-zero. */ -template -static int assignCentroids( - handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, - index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, - value_type_t* residual_host) { +template +static int assignCentroids(handle_t const& handle, + thrust_exe_pol_t thrust_exec_policy, + index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* residual_host) +{ auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Compute distance between centroids and observation vectors CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); @@ -574,11 +600,9 @@ static int assignCentroids( constexpr index_type_t grid_lower_bound{65535}; gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound); gridDim.y = min(k, grid_lower_bound); - gridDim.z = - min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); + gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); - computeDistances<<>>(n, d, k, obs, centroids, - dists); + computeDistances<<>>(n, d, k, obs, centroids, dists); CHECK_CUDA(stream); // Find centroid closest to each observation vector @@ -586,23 +610,21 @@ static int assignCentroids( blockDim.x = BLOCK_SIZE; blockDim.y = 1; blockDim.z = 1; - gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); - gridDim.y = 1; - gridDim.z = 1; - minDistances<<>>(n, k, dists, codes, - clusterSizes); + gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); + gridDim.y = 1; + gridDim.z = 1; + minDistances<<>>(n, k, dists, codes, clusterSizes); CHECK_CUDA(stream); // Compute residual sum of squares - *residual_host = - thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(dists), - thrust::device_pointer_cast(dists + n)); + *residual_host = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n)); return 0; } -/** - * @brief Update cluster centroids for k-means algorithm. +/** + * @brief Update cluster centroids for k-means algorithm. * All clusters are assumed to be non-empty. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. @@ -628,29 +650,31 @@ static int assignCentroids( * Workspace. * @return Zero if successful. Otherwise non-zero. */ -template +template static int updateCentroids(handle_t const& handle, - thrust_exe_pol_t thrust_exec_policy, index_type_t n, - index_type_t d, index_type_t k, + thrust_exe_pol_t thrust_exec_policy, + index_type_t n, + index_type_t d, + index_type_t k, const value_type_t* __restrict__ obs, const index_type_t* __restrict__ codes, const index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ centroids, value_type_t* __restrict__ work, - index_type_t* __restrict__ work_int) { + index_type_t* __restrict__ work_int) +{ // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- // Useful constants - const value_type_t one = 1; + const value_type_t one = 1; const value_type_t zero = 0; constexpr index_type_t grid_lower_bound{65535}; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Device memory thrust::device_ptr obs_copy(work); @@ -658,34 +682,56 @@ static int updateCentroids(handle_t const& handle, thrust::device_ptr rows(work_int + d * n); // Take transpose of observation matrix - CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, n, d, &one, obs, - d, &zero, (value_type_t*)NULL, n, - thrust::raw_pointer_cast(obs_copy), n, stream)); + CUBLAS_CHECK(cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + n, + d, + &one, + obs, + d, + &zero, + (value_type_t*)NULL, + n, + thrust::raw_pointer_cast(obs_copy), + n, + stream)); // Cluster assigned to each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); CHECK_CUDA(stream); - thrust::transform(thrust_exec_policy, rows, rows + d * n, - thrust::make_constant_iterator(n), rows, + thrust::transform(thrust_exec_policy, + rows, + rows + d * n, + thrust::make_constant_iterator(n), + rows, thrust::modulus()); CHECK_CUDA(stream); - thrust::gather(thrust_exec_policy, rows, rows + d * n, - thrust::device_pointer_cast(codes), codes_copy); + thrust::gather( + thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy); CHECK_CUDA(stream); // Row associated with each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); CHECK_CUDA(stream); - thrust::transform(thrust_exec_policy, rows, rows + d * n, - thrust::make_constant_iterator(n), rows, + thrust::transform(thrust_exec_policy, + rows, + rows + d * n, + thrust::make_constant_iterator(n), + rows, thrust::divides()); CHECK_CUDA(stream); // Sort and reduce to add observation vectors in same cluster - thrust::stable_sort_by_key(thrust_exec_policy, codes_copy, codes_copy + d * n, + thrust::stable_sort_by_key(thrust_exec_policy, + codes_copy, + codes_copy + d * n, make_zip_iterator(make_tuple(obs_copy, rows))); CHECK_CUDA(stream); - thrust::reduce_by_key(thrust_exec_policy, rows, rows + d * n, obs_copy, + thrust::reduce_by_key(thrust_exec_policy, + rows, + rows + d * n, + obs_copy, codes_copy, // Output to codes_copy is ignored thrust::device_pointer_cast(centroids)); CHECK_CUDA(stream); @@ -696,12 +742,11 @@ static int updateCentroids(handle_t const& handle, dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1}; // CUDA grid dimensions - dim3 gridDim{ - min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), 1}; + dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), + 1}; - divideCentroids<<>>(d, k, clusterSizes, - centroids); + divideCentroids<<>>(d, k, clusterSizes, centroids); CHECK_CUDA(stream); return 0; @@ -715,8 +760,8 @@ namespace raft { // k-means algorithm // ========================================================= -/** - * @brief Find clusters with k-means algorithm. +/** + * @brief Find clusters with k-means algorithm. * Initial centroids are chosen with k-means++ algorithm. Empty * clusters are reinitialized by choosing new centroids with * k-means++ algorithm. @@ -754,17 +799,24 @@ namespace raft { * @param seed random seed to be used. * @return error flag. */ -template -int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, value_type_t tol, - index_type_t maxiter, const value_type_t* __restrict__ obs, +template +int kmeans(handle_t const& handle, + thrust_exe_pol_t thrust_exec_policy, + index_type_t n, + index_type_t d, + index_type_t k, + value_type_t tol, + index_type_t maxiter, + const value_type_t* __restrict__ obs, index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ centroids, - value_type_t* __restrict__ work, index_type_t* __restrict__ work_int, - value_type_t* residual_host, index_type_t* iters_host, - unsigned long long seed) { + value_type_t* __restrict__ work, + index_type_t* __restrict__ work_int, + value_type_t* residual_host, + index_type_t* iters_host, + unsigned long long seed) +{ // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -786,100 +838,120 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // ------------------------------------------------------- auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Trivial cases if (k == 1) { CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); - CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), - cudaMemcpyHostToDevice, stream)); - if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, - clusterSizes, centroids, work, work_int)) + CUDA_TRY( + cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); + if (updateCentroids( + handle, thrust_exec_policy, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) WARNING("could not compute k-means centroids"); dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; dim3 gridDim{ - min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1, - min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), - grid_lower_bound)}; + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + 1, + min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), grid_lower_bound)}; CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); - computeDistances<<>>(n, d, 1, obs, centroids, - work); + computeDistances<<>>(n, d, 1, obs, centroids, work); CHECK_CUDA(stream); - *residual_host = - thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(work), - thrust::device_pointer_cast(work + n)); + *residual_host = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); CHECK_CUDA(stream); return 0; } if (n <= k) { - thrust::sequence(thrust_exec_policy, thrust::device_pointer_cast(codes), + thrust::sequence(thrust_exec_policy, + thrust::device_pointer_cast(codes), thrust::device_pointer_cast(codes + n)); CHECK_CUDA(stream); - thrust::fill_n(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), n, 1); + thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1); CHECK_CUDA(stream); if (n < k) - CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, - (k - n) * sizeof(index_type_t), stream)); - CUDA_TRY(cudaMemcpyAsync(centroids, obs, d * n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(index_type_t), stream)); + CUDA_TRY(cudaMemcpyAsync( + centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); *residual_host = 0; return 0; } // Initialize cuBLAS - CUBLAS_CHECK( - linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // k-means++ algorithm // ------------------------------------------------------- // Choose initial cluster centroids - if (initializeCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - codes, clusterSizes, work, seed)) + if (initializeCentroids( + handle, thrust_exec_policy, n, d, k, obs, centroids, codes, clusterSizes, work, seed)) WARNING("could not initialize k-means centroids"); // Apply k-means iteration until convergence for (iter = 0; iter < maxiter; ++iter) { // Update cluster centroids - if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, - clusterSizes, centroids, work, work_int)) + if (updateCentroids( + handle, thrust_exec_policy, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) WARNING("could not update k-means centroids"); // Determine centroid closest to each observation residualPrev = *residual_host; - if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - work, codes, clusterSizes, residual_host)) + if (assignCentroids(handle, + thrust_exec_policy, + n, + d, + k, + obs, + centroids, + work, + codes, + clusterSizes, + residual_host)) WARNING("could not assign observation vectors to k-means clusters"); // Reinitialize empty clusters with new centroids - index_type_t emptyCentroid = - (thrust::find(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), 0) - - thrust::device_pointer_cast(clusterSizes)); + index_type_t emptyCentroid = (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); // FIXME: emptyCentroid never reaches k (infinite loop) under certain // conditions, such as if obs is corrupt (as seen as a result of a // DataFrame column of NULL edge vals used to create the Graph) while (emptyCentroid < k) { - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, - uniformDist(rng), obs, work, + if (chooseNewCentroid(handle, + thrust_exec_policy, + n, + d, + k, + uniformDist(rng), + obs, + work, centroids + IDX(0, emptyCentroid, d))) WARNING("could not replace empty centroid"); - if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - work, codes, clusterSizes, residual_host)) + if (assignCentroids(handle, + thrust_exec_policy, + n, + d, + k, + obs, + centroids, + work, + codes, + clusterSizes, + residual_host)) WARNING("could not assign observation vectors to k-means clusters"); - emptyCentroid = - (thrust::find(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), 0) - - thrust::device_pointer_cast(clusterSizes)); + emptyCentroid = (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); CHECK_CUDA(stream); } @@ -891,14 +963,13 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, } // Warning if k-means has failed to converge - if (std::fabs(residualPrev - (*residual_host)) / n >= tol) - WARNING("k-means failed to converge"); + if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); *iters_host = iter; return 0; } -/** +/** * @brief Find clusters with k-means algorithm. * Initial centroids are chosen with k-means++ algorithm. Empty * clusters are reinitialized by choosing new centroids with @@ -926,13 +997,20 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, * @param seed random seed to be used. * @return error flag */ -template -int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, value_type_t tol, - index_type_t maxiter, const value_type_t* __restrict__ obs, - index_type_t* __restrict__ codes, value_type_t& residual, - index_type_t& iters, unsigned long long seed = 123456) { +template +int kmeans(handle_t const& handle, + thrust_exe_pol_t thrust_exec_policy, + index_type_t n, + index_type_t d, + index_type_t k, + value_type_t tol, + index_type_t maxiter, + const value_type_t* __restrict__ obs, + index_type_t* __restrict__ codes, + value_type_t& residual, + index_type_t& iters, + unsigned long long seed = 123456) +{ using namespace matrix; // Check that parameters are valid @@ -949,10 +1027,22 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, vector_t work_int(handle, 2 * d * n); // Perform k-means - return kmeans( - handle, thrust_exec_policy, n, d, k, tol, maxiter, obs, codes, - clusterSizes.raw(), centroids.raw(), work.raw(), work_int.raw(), &residual, - &iters, seed); + return kmeans(handle, + thrust_exec_policy, + n, + d, + k, + tol, + maxiter, + obs, + codes, + clusterSizes.raw(), + centroids.raw(), + work.raw(), + work_int.raw(), + &residual, + &iters, + seed); } } // namespace raft diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp index d14bf05f37..35fc22c770 100644 --- a/cpp/include/raft/spectral/lapack.hpp +++ b/cpp/include/raft/spectral/lapack.hpp @@ -21,66 +21,125 @@ #include #include -//for now; TODO: check if/where this `define` should be; +// for now; TODO: check if/where this `define` should be; // #define USE_LAPACK namespace raft { -#define lapackCheckError(status) \ - { \ - if (status < 0) { \ - std::stringstream ss; \ - ss << "Lapack error: argument number " << -status \ - << " had an illegal value."; \ - throw exception(ss.str()); \ - } else if (status > 0) \ - RAFT_FAIL("Lapack error: internal error."); \ +#define lapackCheckError(status) \ + { \ + if (status < 0) { \ + std::stringstream ss; \ + ss << "Lapack error: argument number " << -status << " had an illegal value."; \ + throw exception(ss.str()); \ + } else if (status > 0) \ + RAFT_FAIL("Lapack error: internal error."); \ } -extern "C" void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, - float *work, int *lwork, int *info); -extern "C" void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau, - double *work, int *lwork, int *info); -extern "C" void sormqr_(char *side, char *trans, int *m, int *n, int *k, - float *a, int *lda, const float *tau, float *c, - int *ldc, float *work, int *lwork, int *info); -extern "C" void dormqr_(char *side, char *trans, int *m, int *n, int *k, - double *a, int *lda, const double *tau, double *c, - int *ldc, double *work, int *lwork, int *info); -extern "C" int dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, - double *wr, double *wi, double *vl, int *ldvl, double *vr, - int *ldvr, double *work, int *lwork, int *info); - -extern "C" int sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, - float *wr, float *wi, float *vl, int *ldvl, float *vr, - int *ldvr, float *work, int *lwork, int *info); - -extern "C" cusolverStatus_t cusolverDnSgemmHost( - cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - const float *alpha, const float *A, int lda, const float *B, int ldb, - const float *beta, float *C, int ldc); - -extern "C" cusolverStatus_t cusolverDnDgemmHost( - cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - const double *alpha, const double *A, int lda, const double *B, int ldb, - const double *beta, double *C, int ldc); - -extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float *d, float *e, - int *info); - -extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double *d, double *e, - int *info); - -extern "C" cusolverStatus_t cusolverDnSsteqrHost(const signed char *compz, - int n, float *d, float *e, - float *z, int ldz, float *work, - int *info); - -extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char *compz, - int n, double *d, double *e, - double *z, int ldz, - double *work, int *info); +extern "C" void sgeqrf_( + int* m, int* n, float* a, int* lda, float* tau, float* work, int* lwork, int* info); +extern "C" void dgeqrf_( + int* m, int* n, double* a, int* lda, double* tau, double* work, int* lwork, int* info); +extern "C" void sormqr_(char* side, + char* trans, + int* m, + int* n, + int* k, + float* a, + int* lda, + const float* tau, + float* c, + int* ldc, + float* work, + int* lwork, + int* info); +extern "C" void dormqr_(char* side, + char* trans, + int* m, + int* n, + int* k, + double* a, + int* lda, + const double* tau, + double* c, + int* ldc, + double* work, + int* lwork, + int* info); +extern "C" int dgeev_(char* jobvl, + char* jobvr, + int* n, + double* a, + int* lda, + double* wr, + double* wi, + double* vl, + int* ldvl, + double* vr, + int* ldvr, + double* work, + int* lwork, + int* info); + +extern "C" int sgeev_(char* jobvl, + char* jobvr, + int* n, + float* a, + int* lda, + float* wr, + float* wi, + float* vl, + int* ldvl, + float* vr, + int* ldvr, + float* work, + int* lwork, + int* info); + +extern "C" cusolverStatus_t cusolverDnSgemmHost(cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, + float* C, + int ldc); + +extern "C" cusolverStatus_t cusolverDnDgemmHost(cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double* alpha, + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, + double* C, + int ldc); + +extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float* d, float* e, int* info); + +extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double* d, double* e, int* info); + +extern "C" cusolverStatus_t cusolverDnSsteqrHost( + const signed char* compz, int n, float* d, float* e, float* z, int ldz, float* work, int* info); + +extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char* compz, + int n, + double* d, + double* e, + double* z, + int ldz, + double* work, + int* info); template class Lapack { @@ -91,182 +150,339 @@ class Lapack { public: static void check_lapack_enabled(); - static void gemm(bool transa, bool transb, int m, int n, int k, T alpha, - const T *A, int lda, const T *B, int ldb, T beta, T *C, + static void gemm(bool transa, + bool transb, + int m, + int n, + int k, + T alpha, + const T* A, + int lda, + const T* B, + int ldb, + T beta, + T* C, int ldc); // special QR for lanczos - static void sterf(int n, T *d, T *e); - static void steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work); + static void sterf(int n, T* d, T* e); + static void steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work); // QR // computes the QR factorization of a general matrix - static void geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork); + static void geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork); // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf. // multiply C by implicit Q - static void ormqr(bool right_side, bool transq, int m, int n, int k, T *a, - int lda, T *tau, T *c, int ldc, T *work, int *lwork); - - static void geev(T *A, T *eigenvalues, int dim, int lda); - static void geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, + static void ormqr(bool right_side, + bool transq, + int m, + int n, + int k, + T* a, + int lda, + T* tau, + T* c, + int ldc, + T* work, + int* lwork); + + static void geev(T* A, T* eigenvalues, int dim, int lda); + static void geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr); + static void geev(T* A, + T* eigenvalues_r, + T* eigenvalues_i, + T* eigenvectors_r, + T* eigenvectors_i, + int dim, + int lda, int ldvr); - static void geev(T *A, T *eigenvalues_r, T *eigenvalues_i, T *eigenvectors_r, - T *eigenvectors_i, int dim, int lda, int ldvr); private: - static void lapack_gemm(const char transa, const char transb, int m, int n, - int k, float alpha, const float *a, int lda, - const float *b, int ldb, float beta, float *c, - int ldc) { - cublasOperation_t cublas_transa = - (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cublas_transb = - (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cusolverDnSgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha, - (float *)a, lda, (float *)b, ldb, &beta, c, ldc); + static void lapack_gemm(const char transa, + const char transb, + int m, + int n, + int k, + float alpha, + const float* a, + int lda, + const float* b, + int ldb, + float beta, + float* c, + int ldc) + { + cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnSgemmHost( + cublas_transa, cublas_transb, m, n, k, &alpha, (float*)a, lda, (float*)b, ldb, &beta, c, ldc); } - static void lapack_gemm(const signed char transa, const signed char transb, - int m, int n, int k, double alpha, const double *a, - int lda, const double *b, int ldb, double beta, - double *c, int ldc) { - cublasOperation_t cublas_transa = - (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cublas_transb = - (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cusolverDnDgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha, - (double *)a, lda, (double *)b, ldb, &beta, c, ldc); + static void lapack_gemm(const signed char transa, + const signed char transb, + int m, + int n, + int k, + double alpha, + const double* a, + int lda, + const double* b, + int ldb, + double beta, + double* c, + int ldc) + { + cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnDgemmHost(cublas_transa, + cublas_transb, + m, + n, + k, + &alpha, + (double*)a, + lda, + (double*)b, + ldb, + &beta, + c, + ldc); } - static void lapack_sterf(int n, float *d, float *e, int *info) { + static void lapack_sterf(int n, float* d, float* e, int* info) + { cusolverDnSsterfHost(n, d, e, info); } - static void lapack_sterf(int n, double *d, double *e, int *info) { + static void lapack_sterf(int n, double* d, double* e, int* info) + { cusolverDnDsterfHost(n, d, e, info); } - static void lapack_steqr(const signed char compz, int n, float *d, float *e, - float *z, int ldz, float *work, int *info) { + static void lapack_steqr( + const signed char compz, int n, float* d, float* e, float* z, int ldz, float* work, int* info) + { cusolverDnSsteqrHost(&compz, n, d, e, z, ldz, work, info); } - static void lapack_steqr(const signed char compz, int n, double *d, double *e, - double *z, int ldz, double *work, int *info) { + static void lapack_steqr(const signed char compz, + int n, + double* d, + double* e, + double* z, + int ldz, + double* work, + int* info) + { cusolverDnDsteqrHost(&compz, n, d, e, z, ldz, work, info); } - static void lapack_geqrf(int m, int n, float *a, int lda, float *tau, - float *work, int *lwork, int *info) { + static void lapack_geqrf( + int m, int n, float* a, int lda, float* tau, float* work, int* lwork, int* info) + { sgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); } - static void lapack_geqrf(int m, int n, double *a, int lda, double *tau, - double *work, int *lwork, int *info) { + static void lapack_geqrf( + int m, int n, double* a, int lda, double* tau, double* work, int* lwork, int* info) + { dgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); } - static void lapack_ormqr(char side, char trans, int m, int n, int k, float *a, - int lda, float *tau, float *c, int ldc, float *work, - int *lwork, int *info) { - sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, - info); + static void lapack_ormqr(char side, + char trans, + int m, + int n, + int k, + float* a, + int lda, + float* tau, + float* c, + int ldc, + float* work, + int* lwork, + int* info) + { + sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); } - static void lapack_ormqr(char side, char trans, int m, int n, int k, - double *a, int lda, double *tau, double *c, int ldc, - double *work, int *lwork, int *info) { - dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, - info); + static void lapack_ormqr(char side, + char trans, + int m, + int n, + int k, + double* a, + int lda, + double* tau, + double* c, + int ldc, + double* work, + int* lwork, + int* info) + { + dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); } - static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, double *a, - int *lda, double *wr, double *wi, double *vl, - int *ldvl, double *vr, int *ldvr, - double *work, int *lwork, int *info) { - return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, - lwork, info); + static int lapack_geev_dispatch(char* jobvl, + char* jobvr, + int* n, + double* a, + int* lda, + double* wr, + double* wi, + double* vl, + int* ldvl, + double* vr, + int* ldvr, + double* work, + int* lwork, + int* info) + { + return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); } - static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, float *a, - int *lda, float *wr, float *wi, float *vl, - int *ldvl, float *vr, int *ldvr, float *work, - int *lwork, int *info) { - return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, - lwork, info); + static int lapack_geev_dispatch(char* jobvl, + char* jobvr, + int* n, + float* a, + int* lda, + float* wr, + float* wi, + float* vl, + int* ldvl, + float* vr, + int* ldvr, + float* work, + int* lwork, + int* info) + { + return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); } // real eigenvalues - static void lapack_geev(T *A, T *eigenvalues, int dim, int lda) { + static void lapack_geev(T* A, T* eigenvalues, int dim, int lda) + { char job = 'N'; std::vector WI(dim); - int ldv = 1; - T *vl = 0; + int ldv = 1; + T* vl = 0; int work_size = 6 * dim; std::vector work(work_size); int info; - lapack_geev_dispatch(&job, &job, &dim, A, &lda, eigenvalues, WI.data(), vl, - &ldv, vl, &ldv, work.data(), &work_size, &info); + lapack_geev_dispatch(&job, + &job, + &dim, + A, + &lda, + eigenvalues, + WI.data(), + vl, + &ldv, + vl, + &ldv, + work.data(), + &work_size, + &info); lapackCheckError(info); } // real eigenpairs - static void lapack_geev(T *A, T *eigenvalues, T *eigenvectors, int dim, - int lda, int ldvr) { + static void lapack_geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr) + { char jobvl = 'N'; char jobvr = 'V'; std::vector WI(dim); int work_size = 6 * dim; - T *vl = 0; - int ldvl = 1; + T* vl = 0; + int ldvl = 1; std::vector work(work_size); int info; - lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues, WI.data(), - vl, &ldvl, eigenvectors, &ldvr, work.data(), - &work_size, &info); + lapack_geev_dispatch(&jobvl, + &jobvr, + &dim, + A, + &lda, + eigenvalues, + WI.data(), + vl, + &ldvl, + eigenvectors, + &ldvr, + work.data(), + &work_size, + &info); lapackCheckError(info); } // complex eigenpairs - static void lapack_geev(T *A, T *eigenvalues_r, T *eigenvalues_i, - T *eigenvectors_r, T *eigenvectors_i, int dim, - int lda, int ldvr) { - char jobvl = 'N'; - char jobvr = 'V'; + static void lapack_geev(T* A, + T* eigenvalues_r, + T* eigenvalues_i, + T* eigenvectors_r, + T* eigenvectors_i, + int dim, + int lda, + int ldvr) + { + char jobvl = 'N'; + char jobvr = 'V'; int work_size = 8 * dim; - int ldvl = 1; + int ldvl = 1; std::vector work(work_size); int info; - lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues_r, - eigenvalues_i, 0, &ldvl, eigenvectors_r, &ldvr, - work.data(), &work_size, &info); + lapack_geev_dispatch(&jobvl, + &jobvr, + &dim, + A, + &lda, + eigenvalues_r, + eigenvalues_i, + 0, + &ldvl, + eigenvectors_r, + &ldvr, + work.data(), + &work_size, + &info); lapackCheckError(info); } }; template -void Lapack::check_lapack_enabled() { +void Lapack::check_lapack_enabled() +{ #ifndef USE_LAPACK RAFT_FAIL("Error: LAPACK not enabled."); #endif } template -void Lapack::gemm(bool transa, bool transb, int m, int n, int k, T alpha, - const T *A, int lda, const T *B, int ldb, T beta, T *C, - int ldc) { +void Lapack::gemm(bool transa, + bool transb, + int m, + int n, + int k, + T alpha, + const T* A, + int lda, + const T* B, + int ldb, + T beta, + T* C, + int ldc) +{ // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK const char transA_char = transa ? 'T' : 'N'; const char transB_char = transb ? 'T' : 'N'; - lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, - ldc); + lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); //#endif } template -void Lapack::sterf(int n, T *d, T *e) { +void Lapack::sterf(int n, T* d, T* e) +{ // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK int info; @@ -276,7 +492,8 @@ void Lapack::sterf(int n, T *d, T *e) { } template -void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) { +void Lapack::steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work) +{ // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK int info; @@ -286,8 +503,8 @@ void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) { } template -void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, - int *lwork) { +void Lapack::geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork) +{ check_lapack_enabled(); #ifdef USE_LAPACK int info; @@ -296,11 +513,22 @@ void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, #endif } template -void Lapack::ormqr(bool right_side, bool transq, int m, int n, int k, T *a, - int lda, T *tau, T *c, int ldc, T *work, int *lwork) { +void Lapack::ormqr(bool right_side, + bool transq, + int m, + int n, + int k, + T* a, + int lda, + T* tau, + T* c, + int ldc, + T* work, + int* lwork) +{ check_lapack_enabled(); #ifdef USE_LAPACK - char side = right_side ? 'R' : 'L'; + char side = right_side ? 'R' : 'L'; char trans = transq ? 'T' : 'N'; int info; lapack_ormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info); @@ -310,7 +538,8 @@ void Lapack::ormqr(bool right_side, bool transq, int m, int n, int k, T *a, // real eigenvalues template -void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) { +void Lapack::geev(T* A, T* eigenvalues, int dim, int lda) +{ check_lapack_enabled(); #ifdef USE_LAPACK lapack_geev(A, eigenvalues, dim, lda); @@ -318,8 +547,8 @@ void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) { } // real eigenpairs template -void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, - int ldvr) { +void Lapack::geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr) +{ check_lapack_enabled(); #ifdef USE_LAPACK lapack_geev(A, eigenvalues, eigenvectors, dim, lda, ldvr); @@ -327,13 +556,18 @@ void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, } // complex eigenpairs template -void Lapack::geev(T *A, T *eigenvalues_r, T *eigenvalues_i, - T *eigenvectors_r, T *eigenvectors_i, int dim, int lda, - int ldvr) { +void Lapack::geev(T* A, + T* eigenvalues_r, + T* eigenvalues_i, + T* eigenvectors_r, + T* eigenvectors_i, + int dim, + int lda, + int ldvr) +{ check_lapack_enabled(); #ifdef USE_LAPACK - lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, - dim, lda, ldvr); + lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, dim, lda, ldvr); #endif } diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index c43154d17a..89d2b7e8ec 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -40,10 +40,12 @@ using size_type = int; // for now; TODO: move it in appropriate header // Apply diagonal matrix to vector: // template -static __global__ void diagmv(IndexType_ n, ValueType_ alpha, +static __global__ void diagmv(IndexType_ n, + ValueType_ alpha, const ValueType_* __restrict__ D, const ValueType_* __restrict__ x, - ValueType_* __restrict__ y) { + ValueType_* __restrict__ y) +{ IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { y[i] += alpha * D[i] * x[i]; @@ -58,7 +60,7 @@ enum struct sparse_mv_alg_t : int { SPARSE_MV_UNDEFINED = -1, SPARSE_MV_ALG_DEFAULT, // generic, for any sparse matrix SPARSE_MV_ALG1, // typical for CSR - SPARSE_MV_ALG2 // may provide better performamce for irregular sparse matrices + SPARSE_MV_ALG2 // may provide better performamce for irregular sparse matrices }; // Vector "view"-like aggregate for linear algebra purposes @@ -68,21 +70,21 @@ struct vector_view_t { value_type* buffer_; size_type size_; - vector_view_t(value_type* buffer, size_type sz) - : buffer_(buffer), size_(sz) {} + vector_view_t(value_type* buffer, size_type sz) : buffer_(buffer), size_(sz) {} - vector_view_t(vector_view_t&& other) - : buffer_(other.buffer_), size_(other.size_) { + vector_view_t(vector_view_t&& other) : buffer_(other.buffer_), size_(other.size_) + { other.buffer_ = nullptr; - other.size_ = 0; + other.size_ = 0; } - vector_view_t& operator=(vector_view_t&& other) { + vector_view_t& operator=(vector_view_t&& other) + { buffer_ = other.buffer_; - size_ = other.size_; + size_ = other.size_; other.buffer_ = nullptr; - other.size_ = 0; + other.size_ = 0; } }; @@ -98,15 +100,16 @@ class vector_t { public: vector_t(handle_t const& raft_handle, size_type sz) : handle_(raft_handle), - buffer_( - static_cast(raft_handle.get_device_allocator()->allocate( - sz * sizeof(value_type), raft_handle.get_stream()))), + buffer_(static_cast(raft_handle.get_device_allocator()->allocate( + sz * sizeof(value_type), raft_handle.get_stream()))), size_(sz), - stream_(raft_handle.get_stream()) {} + stream_(raft_handle.get_stream()) + { + } - ~vector_t(void) { - handle_.get_device_allocator()->deallocate( - buffer_, size_ * sizeof(value_type), stream_); + ~vector_t(void) + { + handle_.get_device_allocator()->deallocate(buffer_, size_ * sizeof(value_type), stream_); } size_type size(void) const { return size_; } @@ -116,26 +119,31 @@ class vector_t { value_type const* raw(void) const { return buffer_; } template - value_type nrm1(ThrustExecPolicy t_exe_pol) const { - return thrust::reduce(t_exe_pol, buffer_, buffer_ + size_, value_type{0}, - [] __device__(auto left, auto right) { - auto abs_left = left > 0 ? left : -left; - auto abs_right = right > 0 ? right : -right; - return abs_left + abs_right; - }); + value_type nrm1(ThrustExecPolicy t_exe_pol) const + { + return thrust::reduce( + t_exe_pol, buffer_, buffer_ + size_, value_type{0}, [] __device__(auto left, auto right) { + auto abs_left = left > 0 ? left : -left; + auto abs_right = right > 0 ? right : -right; + return abs_left + abs_right; + }); } template - void fill(ThrustExecPolicy t_exe_pol, value_type value) { + void fill(ThrustExecPolicy t_exe_pol, value_type value) + { thrust::fill_n(t_exe_pol, buffer_, size_, value); } }; template struct sparse_matrix_t { - sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, - index_type const* col_indices, value_type const* values, - index_type const nrows, index_type const ncols, + sparse_matrix_t(handle_t const& raft_handle, + index_type const* row_offsets, + index_type const* col_indices, + value_type const* values, + index_type const nrows, + index_type const ncols, index_type const nnz) : handle_(raft_handle), row_offsets_(row_offsets), @@ -143,18 +151,25 @@ struct sparse_matrix_t { values_(values), nrows_(nrows), ncols_(ncols), - nnz_(nnz) {} + nnz_(nnz) + { + } - sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, - index_type const* col_indices, value_type const* values, - index_type const nrows, index_type const nnz) + sparse_matrix_t(handle_t const& raft_handle, + index_type const* row_offsets, + index_type const* col_indices, + value_type const* values, + index_type const nrows, + index_type const nnz) : handle_(raft_handle), row_offsets_(row_offsets), col_indices_(col_indices), values_(values), nrows_(nrows), ncols_(nrows), - nnz_(nnz) {} + nnz_(nnz) + { + } template sparse_matrix_t(handle_t const& raft_handle, CSRView const& csr_view) @@ -164,7 +179,9 @@ struct sparse_matrix_t { values_(csr_view.edge_data), nrows_(csr_view.number_of_vertices), ncols_(csr_view.number_of_vertices), - nnz_(csr_view.number_of_edges) {} + nnz_(csr_view.number_of_edges) + { + } virtual ~sparse_matrix_t(void) = default; // virtual because used as base for following matrix types @@ -174,21 +191,24 @@ struct sparse_matrix_t { // descriptor creation works with non-const, and const-casting // down is dangerous) // - virtual void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + virtual void mv(value_type alpha, + value_type* __restrict__ x, + value_type beta, value_type* __restrict__ y, sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, - bool transpose = false, bool symmetric = false) const { + bool transpose = false, + bool symmetric = false) const + { using namespace sparse; RAFT_EXPECTS(x != nullptr, "Null x buffer."); RAFT_EXPECTS(y != nullptr, "Null y buffer."); auto cusparse_h = handle_.get_cusparse_handle(); - auto stream = handle_.get_stream(); + auto stream = handle_.get_stream(); - cusparseOperation_t trans = - transpose ? CUSPARSE_OPERATION_TRANSPOSE : // transpose - CUSPARSE_OPERATION_NON_TRANSPOSE; //non-transpose + cusparseOperation_t trans = transpose ? CUSPARSE_OPERATION_TRANSPOSE : // transpose + CUSPARSE_OPERATION_NON_TRANSPOSE; // non-transpose #if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP auto size_x = transpose ? nrows_ : ncols_; @@ -196,15 +216,19 @@ struct sparse_matrix_t { cusparseSpMVAlg_t spmv_alg = translate_algorithm(alg); - //create descriptors: + // create descriptors: //(below casts are necessary, because // cusparseCreateCsr(...) takes non-const // void*; the casts should be harmless) // cusparseSpMatDescr_t matA; - CUSPARSE_CHECK(cusparsecreatecsr( - &matA, nrows_, ncols_, nnz_, const_cast(row_offsets_), - const_cast(col_indices_), const_cast(values_))); + CUSPARSE_CHECK(cusparsecreatecsr(&matA, + nrows_, + ncols_, + nnz_, + const_cast(row_offsets_), + const_cast(col_indices_), + const_cast(values_))); cusparseDnVecDescr_t vecX; CUSPARSE_CHECK(cusparsecreatednvec(&vecX, size_x, x)); @@ -212,31 +236,29 @@ struct sparse_matrix_t { cusparseDnVecDescr_t vecY; CUSPARSE_CHECK(cusparsecreatednvec(&vecY, size_y, y)); - //get (scratch) external device buffer size: + // get (scratch) external device buffer size: // size_t bufferSize; - CUSPARSE_CHECK(cusparsespmv_buffersize(cusparse_h, trans, &alpha, matA, - vecX, &beta, vecY, spmv_alg, - &bufferSize, stream)); + CUSPARSE_CHECK(cusparsespmv_buffersize( + cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, &bufferSize, stream)); - //allocate external buffer: + // allocate external buffer: // vector_t external_buffer(handle_, bufferSize); - //finally perform SpMV: + // finally perform SpMV: // - CUSPARSE_CHECK(cusparsespmv(cusparse_h, trans, &alpha, matA, vecX, &beta, - vecY, spmv_alg, external_buffer.raw(), stream)); + CUSPARSE_CHECK(cusparsespmv( + cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, external_buffer.raw(), stream)); - //free descriptors: + // free descriptors: //(TODO: maybe wrap them in a RAII struct?) // CUSPARSE_CHECK(cusparseDestroyDnVec(vecY)); CUSPARSE_CHECK(cusparseDestroyDnVec(vecX)); CUSPARSE_CHECK(cusparseDestroySpMat(matA)); #else - CUSPARSE_CHECK( - cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); + CUSPARSE_CHECK(cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); cusparseMatDescr_t descr = 0; CUSPARSE_CHECK(cusparseCreateMatDescr(&descr)); if (symmetric) { @@ -245,9 +267,20 @@ struct sparse_matrix_t { CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); } CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); - CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, trans, nrows_, ncols_, nnz_, - &alpha, descr, values_, row_offsets_, - col_indices_, x, &beta, y, stream)); + CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, + trans, + nrows_, + ncols_, + nnz_, + &alpha, + descr, + values_, + row_offsets_, + col_indices_, + x, + &beta, + y, + stream)); CUSPARSE_CHECK(cusparseDestroyMatDescr(descr)); #endif } @@ -255,19 +288,18 @@ struct sparse_matrix_t { handle_t const& get_handle(void) const { return handle_; } #if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP - cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const { + cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const + { switch (alg) { - case sparse_mv_alg_t::SPARSE_MV_ALG1: - return CUSPARSE_CSRMV_ALG1; - case sparse_mv_alg_t::SPARSE_MV_ALG2: - return CUSPARSE_CSRMV_ALG2; - default: - return CUSPARSE_MV_ALG_DEFAULT; + case sparse_mv_alg_t::SPARSE_MV_ALG1: return CUSPARSE_CSRMV_ALG1; + case sparse_mv_alg_t::SPARSE_MV_ALG2: return CUSPARSE_CSRMV_ALG2; + default: return CUSPARSE_MV_ALG_DEFAULT; } } #endif - //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate + // private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, + // aggregate handle_t const& handle_; index_type const* row_offsets_; @@ -284,44 +316,51 @@ struct laplacian_matrix_t : sparse_matrix_t { laplacian_matrix_t(handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, index_type const* row_offsets, - index_type const* col_indices, value_type const* values, - index_type const nrows, index_type const nnz) - : sparse_matrix_t(raft_handle, row_offsets, - col_indices, values, nrows, nnz), - diagonal_(raft_handle, nrows) { + index_type const* col_indices, + value_type const* values, + index_type const nrows, + index_type const nnz) + : sparse_matrix_t( + raft_handle, row_offsets, col_indices, values, nrows, nnz), + diagonal_(raft_handle, nrows) + { vector_t ones{raft_handle, nrows}; ones.fill(thrust_exec_policy, 1.0); - sparse_matrix_t::mv(1, ones.raw(), 0, - diagonal_.raw()); + sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } template laplacian_matrix_t(handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, sparse_matrix_t const& csr_m) - : sparse_matrix_t(raft_handle, csr_m.row_offsets_, - csr_m.col_indices_, csr_m.values_, - csr_m.nrows_, csr_m.nnz_), - diagonal_(raft_handle, csr_m.nrows_) { + : sparse_matrix_t(raft_handle, + csr_m.row_offsets_, + csr_m.col_indices_, + csr_m.values_, + csr_m.nrows_, + csr_m.nnz_), + diagonal_(raft_handle, csr_m.nrows_) + { vector_t ones{raft_handle, csr_m.nrows_}; ones.fill(thrust_exec_policy, 1.0); - sparse_matrix_t::mv(1, ones.raw(), 0, - diagonal_.raw()); + sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } // y = alpha*A*x + beta*y // - void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + void mv(value_type alpha, + value_type* __restrict__ x, + value_type beta, value_type* __restrict__ y, sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, - bool transpose = false, bool symmetric = false) const override { + bool transpose = false, + bool symmetric = false) const override + { constexpr int BLOCK_SIZE = 1024; - auto n = sparse_matrix_t::nrows_; + auto n = sparse_matrix_t::nrows_; - auto cublas_h = - sparse_matrix_t::get_handle().get_cublas_handle(); - auto stream = - sparse_matrix_t::get_handle().get_stream(); + auto cublas_h = sparse_matrix_t::get_handle().get_cublas_handle(); + auto stream = sparse_matrix_t::get_handle().get_stream(); // scales y by beta: // @@ -333,8 +372,7 @@ struct laplacian_matrix_t : sparse_matrix_t { // Apply diagonal matrix // - dim3 gridDim{ - std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; + dim3 gridDim{std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; dim3 blockDim{BLOCK_SIZE, 1, 1}; diagmv<<>>(n, alpha, diagonal_.raw(), x, y); @@ -342,8 +380,7 @@ struct laplacian_matrix_t : sparse_matrix_t { // Apply adjacency matrix // - sparse_matrix_t::mv(-alpha, x, 1, y, alg, transpose, - symmetric); + sparse_matrix_t::mv(-alpha, x, 1, y, alg, transpose, symmetric); } vector_t diagonal_; @@ -355,58 +392,68 @@ struct modularity_matrix_t : laplacian_matrix_t { modularity_matrix_t(handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, index_type const* row_offsets, - index_type const* col_indices, value_type const* values, - index_type const nrows, index_type const nnz) + index_type const* col_indices, + value_type const* values, + index_type const nrows, + index_type const nnz) : laplacian_matrix_t( - raft_handle, thrust_exec_policy, row_offsets, col_indices, values, - nrows, nnz) { - edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( - thrust_exec_policy); + raft_handle, thrust_exec_policy, row_offsets, col_indices, values, nrows, nnz) + { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(thrust_exec_policy); } template modularity_matrix_t(handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, sparse_matrix_t const& csr_m) - : laplacian_matrix_t(raft_handle, - thrust_exec_policy, csr_m) { - edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( - thrust_exec_policy); + : laplacian_matrix_t(raft_handle, thrust_exec_policy, csr_m) + { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(thrust_exec_policy); } // y = alpha*A*x + beta*y // - void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + void mv(value_type alpha, + value_type* __restrict__ x, + value_type beta, value_type* __restrict__ y, sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, - bool transpose = false, bool symmetric = false) const override { + bool transpose = false, + bool symmetric = false) const override + { auto n = sparse_matrix_t::nrows_; - auto cublas_h = - sparse_matrix_t::get_handle().get_cublas_handle(); - auto stream = - sparse_matrix_t::get_handle().get_stream(); + auto cublas_h = sparse_matrix_t::get_handle().get_cublas_handle(); + auto stream = sparse_matrix_t::get_handle().get_stream(); // y = A*x // - sparse_matrix_t::mv(alpha, x, 0, y, alg, transpose, - symmetric); + sparse_matrix_t::mv(alpha, x, 0, y, alg, transpose, symmetric); value_type dot_res; // gamma = d'*x // // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); - CUBLAS_CHECK(linalg::cublasdot( - cublas_h, n, laplacian_matrix_t::diagonal_.raw(), - 1, x, 1, &dot_res, stream)); + CUBLAS_CHECK(linalg::cublasdot(cublas_h, + n, + laplacian_matrix_t::diagonal_.raw(), + 1, + x, + 1, + &dot_res, + stream)); // y = y -(gamma/edge_sum)*d // value_type gamma_ = -dot_res / edge_sum_; - CUBLAS_CHECK(linalg::cublasaxpy( - cublas_h, n, &gamma_, - laplacian_matrix_t::diagonal_.raw(), 1, y, 1, - stream)); + CUBLAS_CHECK(linalg::cublasaxpy(cublas_h, + n, + &gamma_, + laplacian_matrix_t::diagonal_.raw(), + 1, + y, + 1, + stream)); } value_type edge_sum_; diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index f8dfe5daa3..bb7087a3be 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -40,7 +40,8 @@ #endif #ifdef COLLECT_TIME_STATISTICS -static double timer(void) { +static double timer(void) +{ struct timeval tv; cudaDeviceSynchronize(); gettimeofday(&tv, NULL); @@ -79,19 +80,27 @@ using namespace linalg; * performed. * @return error flag. */ -template +template std::tuple modularity_maximization( - handle_t const &handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const &csr_m, - EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, - vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { + handle_t const& handle, + ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); std::tuple stats; // # iters eigen solver, cluster solver residual, # iters cluster solver @@ -104,11 +113,10 @@ std::tuple modularity_maximization( modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_config.n_eigVecs; + auto nEigVecs = eigen_config.n_eigVecs; // Compute eigenvectors corresponding to largest eigenvalues - std::get<0>(stats) = - eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); + std::get<0>(stats) = eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); // Whiten eigenvector matrix transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); @@ -119,8 +127,8 @@ std::tuple modularity_maximization( CHECK_CUDA(stream); // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, - nEigVecs, eigVecs, clusters); + auto pair_cluster = + cluster_solver.solve(handle, thrust_exec_policy, n, nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -139,12 +147,13 @@ std::tuple modularity_maximization( * @param modularity On exit, modularity */ template -void analyzeModularity(handle_t const &handle, +void analyzeModularity(handle_t const& handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const &csr_m, + sparse_matrix_t const& csr_m, vertex_t nClusters, - vertex_t const *__restrict__ clusters, - weight_t &modularity) { + vertex_t const* __restrict__ clusters, + weight_t& modularity) +{ RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); vertex_t i; @@ -152,15 +161,14 @@ void analyzeModularity(handle_t const &handle, weight_t partModularity, clustersize; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Device memory vector_t part_i(handle, n); vector_t Bx(handle, n); // Initialize cuBLAS - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; @@ -170,8 +178,8 @@ void analyzeModularity(handle_t const &handle, // Iterate through partitions for (i = 0; i < nClusters; ++i) { - if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, - partModularity, clusters, part_i, Bx, B)) { + if (!construct_indicator( + handle, thrust_exec_policy, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) { WARNING("empty partition"); continue; } diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 841fca04d9..e2576c1d69 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -62,22 +62,30 @@ using namespace linalg; * performed. * @return statistics: number of eigensolver iterations, . */ -template -std::tuple partition( - handle_t const &handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const &csr_m, - EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, - vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { +template +std::tuple partition(handle_t const& handle, + ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); std::tuple - stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver + stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, + //cluster solver residual, # iters cluster solver vertex_t n = csr_m.nrows_; @@ -88,22 +96,21 @@ std::tuple partition( // Compute eigenvectors of Laplacian // Initialize Laplacian - ///sparse_matrix_t A{handle, graph}; + /// sparse_matrix_t A{handle, graph}; laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_config.n_eigVecs; + auto nEigVecs = eigen_config.n_eigVecs; // Compute smallest eigenvalues and eigenvectors - std::get<0>(stats) = - eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); + std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); // Whiten eigenvector matrix transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, - nEigVecs, eigVecs, clusters); + auto pair_cluster = + cluster_solver.solve(handle, thrust_exec_policy, n, nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -130,18 +137,21 @@ std::tuple partition( * @return error flag. */ template -void analyzePartition(handle_t const &handle, +void analyzePartition(handle_t const& handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const &csr_m, - vertex_t nClusters, const vertex_t *__restrict__ clusters, - weight_t &edgeCut, weight_t &cost) { + sparse_matrix_t const& csr_m, + vertex_t nClusters, + const vertex_t* __restrict__ clusters, + weight_t& edgeCut, + weight_t& cost) +{ RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); vertex_t i; vertex_t n = csr_m.nrows_; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); weight_t partEdgesCut, clustersize; @@ -150,22 +160,21 @@ void analyzePartition(handle_t const &handle, vector_t Lx(handle, n); // Initialize cuBLAS - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Laplacian - ///sparse_matrix_t A{handle, graph}; + /// sparse_matrix_t A{handle, graph}; laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; // Initialize output - cost = 0; + cost = 0; edgeCut = 0; // Iterate through partitions for (i = 0; i < nClusters; ++i) { // Construct indicator vector for ith partition - if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, - partEdgesCut, clusters, part_i, Lx, L)) { + if (!construct_indicator( + handle, thrust_exec_policy, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) { WARNING("empty partition"); continue; } diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index 40dde30a74..5349cb2810 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -28,20 +28,18 @@ namespace raft { namespace spectral { template -static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, - value_type_t* obs) { +static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, value_type_t* obs) +{ index_type_t i, j, k, index, mm; value_type_t alpha, v, last; bool valid; // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension // compute alpha - mm = (((m + blockDim.x - 1) / blockDim.x) * - blockDim.x); // m in multiple of blockDim.x + mm = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x); // m in multiple of blockDim.x alpha = 0.0; - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; - j += blockDim.y * gridDim.y) { + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { for (i = threadIdx.x; i < mm; i += blockDim.x) { // check if the thread is valid valid = i < m; @@ -66,17 +64,17 @@ static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, // scale by alpha alpha = __shfl_sync(warp_full_mask(), alpha, blockDim.x - 1, blockDim.x); alpha = std::sqrt(alpha); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; - j += blockDim.y * gridDim.y) { + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 - index = i + j * m; + index = i + j * m; obs[index] = obs[index] / alpha; } } } template -index_type_t next_pow2(index_type_t n) { +index_type_t next_pow2(index_type_t n) +{ index_type_t v; // Reference: // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float @@ -90,7 +88,8 @@ index_type_t next_pow2(index_type_t n) { } template -cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) { +cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) +{ index_type_t p2m; // find next power of 2 @@ -102,19 +101,20 @@ cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) { dim3 nblocks{1, (n + nthreads.y - 1) / nthreads.y, 1}; // launch scaling kernel (scale each column of obs by its norm) - scale_obs_kernel - <<>>(m, n, obs); + scale_obs_kernel<<>>(m, n, obs); return cudaSuccess; } -template +template void transform_eigen_matrix(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, edge_t n, - vertex_t nEigVecs, weight_t* eigVecs) { + ThrustExePolicy thrust_exec_policy, + edge_t n, + vertex_t nEigVecs, + weight_t* eigVecs) +{ auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); const weight_t zero{0.0}; const weight_t one{1.0}; @@ -123,9 +123,9 @@ void transform_eigen_matrix(handle_t const& handle, for (auto i = 0; i < nEigVecs; ++i) { weight_t mean, std; - mean = thrust::reduce( - thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + mean = thrust::reduce(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); CHECK_CUDA(stream); mean /= n; thrust::transform(thrust_exec_policy, @@ -136,8 +136,7 @@ void transform_eigen_matrix(handle_t const& handle, thrust::minus()); CHECK_CUDA(stream); - CUBLAS_CHECK( - cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); std /= std::sqrt(static_cast(n)); @@ -154,16 +153,25 @@ void transform_eigen_matrix(handle_t const& handle, // TODO: in-place transpose { vector_t work(handle, nEigVecs * n); - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n, - &one, eigVecs, n, &zero, (weight_t*)NULL, nEigVecs, - work.raw(), nEigVecs, stream)); - - CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(), - nEigVecs * n * sizeof(weight_t), - cudaMemcpyDeviceToDevice, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + CUBLAS_CHECK(cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + nEigVecs, + n, + &one, + eigVecs, + n, + &zero, + (weight_t*)NULL, + nEigVecs, + work.raw(), + nEigVecs, + stream)); + + CUDA_TRY(cudaMemcpyAsync( + eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream)); } } @@ -178,49 +186,48 @@ struct equal_to_i_op { public: equal_to_i_op(index_type_t _i) : i(_i) {} template - __host__ __device__ void operator()(Tuple_ t) { - thrust::get<1>(t) = - (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0; + __host__ __device__ void operator()(Tuple_ t) + { + thrust::get<1>(t) = (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0; } }; } // namespace // Construct indicator vector for ith partition // -template +template bool construct_indicator(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, edge_t index, - edge_t n, weight_t& clustersize, weight_t& partStats, + ThrustExePolicy thrust_exec_policy, + edge_t index, + edge_t n, + weight_t& clustersize, + weight_t& partStats, vertex_t const* __restrict__ clusters, - vector_t& part_i, vector_t& Bx, - laplacian_matrix_t const& B) { + vector_t& part_i, + vector_t& Bx, + laplacian_matrix_t const& B) +{ auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); - - thrust::for_each(thrust_exec_policy, - thrust::make_zip_iterator(thrust::make_tuple( - thrust::device_pointer_cast(clusters), - thrust::device_pointer_cast(part_i.raw()))), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::device_pointer_cast(clusters + n), - thrust::device_pointer_cast(part_i.raw() + n))), - equal_to_i_op(index)); + auto stream = handle.get_stream(); + + thrust::for_each( + thrust_exec_policy, + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters), + thrust::device_pointer_cast(part_i.raw()))), + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters + n), + thrust::device_pointer_cast(part_i.raw() + n))), + equal_to_i_op(index)); CHECK_CUDA(stream); // Compute size of ith partition - CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, - &clustersize, stream)); + CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream)); clustersize = round(clustersize); - if (clustersize < 0.5) { - return false; - } + if (clustersize < 0.5) { return false; } // Compute part stats B.mv(1, part_i.raw(), 0, Bx.raw()); - CUBLAS_CHECK( - cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); + CUBLAS_CHECK(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); return true; } diff --git a/cpp/include/raft/spectral/warn_dbg.hpp b/cpp/include/raft/spectral/warn_dbg.hpp index 406f1b7c7e..08a4e6efb5 100644 --- a/cpp/include/raft/spectral/warn_dbg.hpp +++ b/cpp/include/raft/spectral/warn_dbg.hpp @@ -4,13 +4,13 @@ #include #define STRINGIFY_DETAIL(x) #x -#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) +#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) #ifdef DEBUG #define COUT() (std::cout) #define CERR() (std::cerr) -//nope: +// nope: // #define WARNING(message) \ do { \ diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh index 8691cabc85..4d6724482c 100644 --- a/cpp/include/raft/stats/mean.cuh +++ b/cpp/include/raft/stats/mean.cuh @@ -26,15 +26,15 @@ namespace stats { ///@todo: ColsPerBlk has been tested only for 32! template -__global__ void meanKernelRowMajor(Type *mu, const Type *data, IdxType D, - IdxType N) { +__global__ void meanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) +{ const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; for (IdxType i = rowId; i < N; i += stride) thread_data += (colId < D) ? data[i * D + colId] : Type(0); __shared__ Type smu[ColsPerBlk]; @@ -46,8 +46,8 @@ __global__ void meanKernelRowMajor(Type *mu, const Type *data, IdxType D, } template -__global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D, - IdxType N) { +__global__ void meanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); @@ -57,9 +57,7 @@ __global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D, thread_data += data[idx]; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { - mu[blockIdx.x] = acc / N; - } + if (threadIdx.x == 0) { mu[blockIdx.x] = acc / N; } } /** @@ -80,24 +78,22 @@ __global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D, * @param stream: cuda stream */ template -void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample, - bool rowMajor, cudaStream_t stream) { +void mean( + Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) +{ static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), - raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemsetAsync(mu, 0, sizeof(Type) * D, stream)); - meanKernelRowMajor - <<>>(mu, data, D, N); + meanKernelRowMajor<<>>(mu, data, D, N); CUDA_CHECK(cudaPeekAtLastError()); Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); raft::linalg::scalarMultiply(mu, mu, ratio, D, stream); } else { - meanKernelColMajor - <<>>(mu, data, D, N); + meanKernelColMajor<<>>(mu, data, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/stats/mean_center.cuh b/cpp/include/raft/stats/mean_center.cuh index 04934d4388..c0ba24312b 100644 --- a/cpp/include/raft/stats/mean_center.cuh +++ b/cpp/include/raft/stats/mean_center.cuh @@ -38,12 +38,25 @@ namespace stats { * @param stream cuda stream where to launch work */ template -void meanCenter(Type *out, const Type *data, const Type *mu, IdxType D, - IdxType N, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void meanCenter(Type* out, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - out, data, mu, D, N, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a - b; }, stream); + out, + data, + mu, + D, + N, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a - b; }, + stream); } /** @@ -61,11 +74,25 @@ void meanCenter(Type *out, const Type *data, const Type *mu, IdxType D, * @param stream cuda stream where to launch work */ template -void meanAdd(Type *out, const Type *data, const Type *mu, IdxType D, IdxType N, - bool rowMajor, bool bcastAlongRows, cudaStream_t stream) { +void meanAdd(Type* out, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - out, data, mu, D, N, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a + b; }, stream); + out, + data, + mu, + D, + N, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a + b; }, + stream); } }; // end namespace stats diff --git a/cpp/include/raft/stats/stddev.cuh b/cpp/include/raft/stats/stddev.cuh index f12c633829..1dd9cd56bc 100644 --- a/cpp/include/raft/stats/stddev.cuh +++ b/cpp/include/raft/stats/stddev.cuh @@ -26,15 +26,15 @@ namespace stats { ///@todo: ColPerBlk has been tested only for 32! template -__global__ void stddevKernelRowMajor(Type *std, const Type *data, IdxType D, - IdxType N) { +__global__ void stddevKernelRowMajor(Type* std, const Type* data, IdxType D, IdxType N) +{ const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; for (IdxType i = rowId; i < N; i += stride) { Type val = (colId < D) ? data[i * D + colId] : Type(0); thread_data += val * val; @@ -48,41 +48,39 @@ __global__ void stddevKernelRowMajor(Type *std, const Type *data, IdxType D, } template -__global__ void stddevKernelColMajor(Type *std, const Type *data, - const Type *mu, IdxType D, IdxType N) { +__global__ void stddevKernelColMajor( + Type* std, const Type* data, const Type* mu, IdxType D, IdxType N) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; + Type m = mu[blockIdx.x]; for (IdxType i = threadIdx.x; i < N; i += TPB) { IdxType idx = colStart + i; - Type diff = data[idx] - m; + Type diff = data[idx] - m; thread_data += diff * diff; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { - std[blockIdx.x] = raft::mySqrt(acc / N); - } + if (threadIdx.x == 0) { std[blockIdx.x] = raft::mySqrt(acc / N); } } template -__global__ void varsKernelColMajor(Type *var, const Type *data, const Type *mu, - IdxType D, IdxType N) { +__global__ void varsKernelColMajor( + Type* var, const Type* data, const Type* mu, IdxType D, IdxType N) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; + Type m = mu[blockIdx.x]; for (IdxType i = threadIdx.x; i < N; i += TPB) { IdxType idx = colStart + i; - Type diff = data[idx] - m; + Type diff = data[idx] - m; thread_data += diff * diff; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { - var[blockIdx.x] = acc / N; - } + if (threadIdx.x == 0) { var[blockIdx.x] = acc / N; } } /** @@ -104,28 +102,33 @@ __global__ void varsKernelColMajor(Type *var, const Type *data, const Type *mu, * @param stream cuda stream where to launch work */ template -void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N, - bool sample, bool rowMajor, cudaStream_t stream) { +void stddev(Type* std, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool sample, + bool rowMajor, + cudaStream_t stream) +{ static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), - raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemset(std, 0, sizeof(Type) * D)); - stddevKernelRowMajor - <<>>(std, data, D, N); + stddevKernelRowMajor<<>>(std, data, D, N); Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); raft::linalg::binaryOp( - std, std, mu, D, - [ratio] __device__(Type a, Type b) { - return raft::mySqrt(a * ratio - b * b); - }, + std, + std, + mu, + D, + [ratio] __device__(Type a, Type b) { return raft::mySqrt(a * ratio - b * b); }, stream); } else { - stddevKernelColMajor - <<>>(std, data, mu, D, N); + stddevKernelColMajor<<>>(std, data, mu, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } @@ -149,25 +152,28 @@ void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N, * @param stream cuda stream where to launch work */ template -void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N, - bool sample, bool rowMajor, cudaStream_t stream) { +void vars(Type* var, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool sample, + bool rowMajor, + cudaStream_t stream) +{ static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), - raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemset(var, 0, sizeof(Type) * D)); - stddevKernelRowMajor - <<>>(var, data, D, N); + stddevKernelRowMajor<<>>(var, data, D, N); Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); raft::linalg::binaryOp( - var, var, mu, D, - [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream); + var, var, mu, D, [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream); } else { - varsKernelColMajor - <<>>(var, data, mu, D, N); + varsKernelColMajor<<>>(var, data, mu, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/stats/sum.cuh b/cpp/include/raft/stats/sum.cuh index 5f8416c7e2..c7b8ce12b6 100644 --- a/cpp/include/raft/stats/sum.cuh +++ b/cpp/include/raft/stats/sum.cuh @@ -26,15 +26,15 @@ namespace stats { ///@todo: ColsPerBlk has been tested only for 32! template -__global__ void sumKernelRowMajor(Type *mu, const Type *data, IdxType D, - IdxType N) { +__global__ void sumKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) +{ const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; for (IdxType i = rowId; i < N; i += stride) thread_data += (colId < D) ? data[i * D + colId] : Type(0); __shared__ Type smu[ColsPerBlk]; @@ -46,8 +46,8 @@ __global__ void sumKernelRowMajor(Type *mu, const Type *data, IdxType D, } template -__global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D, - IdxType N) { +__global__ void sumKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); @@ -57,9 +57,7 @@ __global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D, thread_data += data[idx]; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { - mu[blockIdx.x] = acc; - } + if (threadIdx.x == 0) { mu[blockIdx.x] = acc; } } /** @@ -77,21 +75,19 @@ __global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D, * @param stream cuda stream where to launch work */ template -void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor, - cudaStream_t stream) { +void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream) +{ static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), - raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemset(output, 0, sizeof(Type) * D)); sumKernelRowMajor <<>>(output, input, D, N); } else { - sumKernelColMajor - <<>>(output, input, D, N); + sumKernelColMajor<<>>(output, input, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/vectorized.cuh b/cpp/include/raft/vectorized.cuh index 1829fc0351..1e0885fb99 100644 --- a/cpp/include/raft/vectorized.cuh +++ b/cpp/include/raft/vectorized.cuh @@ -22,11 +22,11 @@ namespace raft { template -struct IOType {}; +struct IOType { +}; template <> struct IOType { - static_assert(sizeof(bool) == sizeof(int8_t), - "IOType bool size assumption failed"); + static_assert(sizeof(bool) == sizeof(int8_t), "IOType bool size assumption failed"); typedef int8_t Type; }; template <> @@ -215,42 +215,42 @@ struct IOType { }; /** - * @struct TxN_t - * - * @brief Internal data structure that is used to define a facade for vectorized - * loads/stores across the most common POD types. The goal of his file is to - * provide with CUDA programmers, an easy way to have compiler issue vectorized - * load or store instructions to memory (either global or shared). Vectorized - * accesses to memory are important as they'll utilize its resources - * efficiently, - * when compared to their non-vectorized counterparts. Obviously, for whatever - * reasons if one is unable to issue such vectorized operations, one can always - * fallback to using POD types. - * - * Example demonstrating the use of load operations, performing math on such - * loaded data and finally storing it back. - * @code{.cu} - * TxN_t mydata1, mydata2; - * int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * mydata1.Ratio; - * mydata1.load(ptr1, idx); - * mydata2.load(ptr2, idx); - * #pragma unroll - * for(int i=0;i type. - * Only change required is to replace variable declaration appropriately. - * - * Obviously, it's caller's responsibility to take care of pointer alignment! - * - * @tparam math_ the data-type in which the compute/math needs to happen - * @tparam veclen_ the number of 'math_' types to be loaded/stored per - * instruction - */ + * @struct TxN_t + * + * @brief Internal data structure that is used to define a facade for vectorized + * loads/stores across the most common POD types. The goal of his file is to + * provide with CUDA programmers, an easy way to have compiler issue vectorized + * load or store instructions to memory (either global or shared). Vectorized + * accesses to memory are important as they'll utilize its resources + * efficiently, + * when compared to their non-vectorized counterparts. Obviously, for whatever + * reasons if one is unable to issue such vectorized operations, one can always + * fallback to using POD types. + * + * Example demonstrating the use of load operations, performing math on such + * loaded data and finally storing it back. + * @code{.cu} + * TxN_t mydata1, mydata2; + * int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * mydata1.Ratio; + * mydata1.load(ptr1, idx); + * mydata2.load(ptr2, idx); + * #pragma unroll + * for(int i=0;i type. + * Only change required is to replace variable declaration appropriately. + * + * Obviously, it's caller's responsibility to take care of pointer alignment! + * + * @tparam math_ the data-type in which the compute/math needs to happen + * @tparam veclen_ the number of 'math_' types to be loaded/stored per + * instruction + */ template struct TxN_t { /** underlying math data type */ @@ -274,7 +274,8 @@ struct TxN_t { * @brief Fill the contents of this structure with a constant value * @param _val the constant to be filled */ - DI void fill(math_t _val) { + DI void fill(math_t _val) + { #pragma unroll for (int i = 0; i < Ratio; ++i) { val.data[i] = _val; @@ -299,21 +300,24 @@ struct TxN_t { * @{ */ template - DI void load(const math_t *ptr, idx_t idx) { - const io_t *bptr = reinterpret_cast(&ptr[idx]); - val.internal = __ldg(bptr); + DI void load(const math_t* ptr, idx_t idx) + { + const io_t* bptr = reinterpret_cast(&ptr[idx]); + val.internal = __ldg(bptr); } template - DI void load(math_t *ptr, idx_t idx) { - io_t *bptr = reinterpret_cast(&ptr[idx]); + DI void load(math_t* ptr, idx_t idx) + { + io_t* bptr = reinterpret_cast(&ptr[idx]); val.internal = *bptr; } template - DI void store(math_t *ptr, idx_t idx) { - io_t *bptr = reinterpret_cast(&ptr[idx]); - *bptr = val.internal; + DI void store(math_t* ptr, idx_t idx) + { + io_t* bptr = reinterpret_cast(&ptr[idx]); + *bptr = val.internal; } /** @} */ }; @@ -330,11 +334,17 @@ struct TxN_t { DI void fill(math_t _val) {} template - DI void load(const math_t *ptr, idx_t idx) {} + DI void load(const math_t* ptr, idx_t idx) + { + } template - DI void load(math_t *ptr, idx_t idx) {} + DI void load(math_t* ptr, idx_t idx) + { + } template - DI void store(math_t *ptr, idx_t idx) {} + DI void store(math_t* ptr, idx_t idx) + { + } }; } // namespace raft diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu index 4ff6cdf5fa..284a873dec 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster_solvers.cu @@ -23,7 +23,8 @@ namespace raft { -TEST(Raft, ClusterSolvers) { +TEST(Raft, ClusterSolvers) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -40,7 +41,7 @@ TEST(Raft, ClusterSolvers) { index_type d{10}; index_type k{5}; - //nullptr expected to trigger exceptions: + // nullptr expected to trigger exceptions: // value_type* eigvecs{nullptr}; index_type* codes{nullptr}; @@ -49,11 +50,11 @@ TEST(Raft, ClusterSolvers) { kmeans_solver_t cluster_solver{cfg}; - EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, - eigvecs, codes)); + EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, eigvecs, codes)); } -TEST(Raft, ModularitySolvers) { +TEST(Raft, ModularitySolvers) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -68,7 +69,7 @@ TEST(Raft, ModularitySolvers) { value_type tol{1.0e-10}; bool reorthog{true}; - //nullptr expected to trigger exceptions: + // nullptr expected to trigger exceptions: // index_type* clusters{nullptr}; value_type* eigvals{nullptr}; @@ -82,21 +83,18 @@ TEST(Raft, ModularitySolvers) { index_type k{5}; - cluster_solver_config_t clust_cfg{k, maxiter, tol, - seed}; + cluster_solver_config_t clust_cfg{k, maxiter, tol, seed}; kmeans_solver_t cluster_solver{clust_cfg}; auto stream = h.get_stream(); - sparse_matrix_t sm{h, nullptr, nullptr, - nullptr, 0, 0}; + sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; auto t_exe_p = thrust::cuda::par.on(stream); EXPECT_ANY_THROW(spectral::modularity_maximization( h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); value_type modularity{0}; - EXPECT_ANY_THROW( - spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity)); + EXPECT_ANY_THROW(spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity)); } } // namespace raft diff --git a/cpp/test/cudart_utils.cpp b/cpp/test/cudart_utils.cpp index c14d880efd..150767992f 100644 --- a/cpp/test/cudart_utils.cpp +++ b/cpp/test/cudart_utils.cpp @@ -20,7 +20,8 @@ namespace raft { -TEST(Raft, Utils) { +TEST(Raft, Utils) +{ ASSERT_NO_THROW(ASSERT(1 == 1, "Should not assert!")); ASSERT_THROW(ASSERT(1 != 1, "Should assert!"), exception); ASSERT_THROW(THROW("Should throw!"), exception); diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu index e2ed2c01dc..9ed32b80ef 100644 --- a/cpp/test/distance/dist_adj.cu +++ b/cpp/test/distance/dist_adj.cu @@ -25,30 +25,42 @@ namespace raft { namespace distance { template -__global__ void naiveDistanceAdjKernel(bool *dist, const DataType *x, - const DataType *y, int m, int n, int k, - DataType eps, bool isRowMajor) { +__global__ void naiveDistanceAdjKernel(bool* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + DataType eps, + bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; auto diff = x[xidx] - y[yidx]; acc += diff * diff; } - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc <= eps; } template -void naiveDistanceAdj(bool *dist, const DataType *x, const DataType *y, int m, - int n, int k, DataType eps, bool isRowMajor) { +void naiveDistanceAdj(bool* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + DataType eps, + bool isRowMajor) +{ static const dim3 TPB(16, 32, 1); dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1); - naiveDistanceAdjKernel - <<>>(dist, x, y, m, n, k, eps, isRowMajor); + naiveDistanceAdjKernel<<>>(dist, x, y, m, n, k, eps, isRowMajor); CUDA_CHECK(cudaPeekAtLastError()); } @@ -61,21 +73,21 @@ struct DistanceAdjInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const DistanceAdjInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const DistanceAdjInputs& dims) +{ return os; } template -class DistanceAdjTest - : public ::testing::TestWithParam> { +class DistanceAdjTest : public ::testing::TestWithParam> { public: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - int m = params.m; - int n = params.n; - int k = params.k; + int m = params.m; + int n = params.n; + int k = params.k; bool isRowMajor = params.isRowMajor; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -89,25 +101,23 @@ class DistanceAdjTest DataType threshold = params.eps; naiveDistanceAdj(dist_ref, x, y, m, n, k, threshold, isRowMajor); - char *workspace = nullptr; - size_t worksize = - raft::distance::getWorkspaceSize(x, y, m, n, k); - if (worksize != 0) { - raft::allocate(workspace, worksize); - } + char* workspace = nullptr; + size_t worksize = raft::distance:: + getWorkspaceSize( + x, y, m, n, k); + if (worksize != 0) { raft::allocate(workspace, worksize); } auto fin_op = [threshold] __device__(DataType d_val, int g_d_idx) { return d_val <= threshold; }; - raft::distance::distance( + raft::distance::distance( x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(workspace)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(x)); CUDA_CHECK(cudaFree(y)); CUDA_CHECK(cudaFree(dist_ref)); @@ -131,13 +141,13 @@ const std::vector> inputsf = { {10.0f, 1024, 1024, 32, false, 1234ULL}, }; typedef DistanceAdjTest DistanceAdjTestF; -TEST_P(DistanceAdjTestF, Result) { +TEST_P(DistanceAdjTestF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::Compare())); } -INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.01, 1024, 1024, 32, true, 1234ULL}, @@ -150,13 +160,13 @@ const std::vector> inputsd = { {10.0, 1024, 1024, 32, false, 1234ULL}, }; typedef DistanceAdjTest DistanceAdjTestD; -TEST_P(DistanceAdjTestD, Result) { +TEST_P(DistanceAdjTestD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::Compare())); } -INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestD, ::testing::ValuesIn(inputsd)); } // namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_canberra.cu b/cpp/test/distance/dist_canberra.cu index 10bc4d1899..c812a1985d 100644 --- a/cpp/test/distance/dist_canberra.cu +++ b/cpp/test/distance/dist_canberra.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceCanberra - : public DistanceTest {}; +class DistanceCanberra : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceCanberra DistanceCanberraF; -TEST_P(DistanceCanberraF, Result) { +TEST_P(DistanceCanberraF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceCanberra DistanceCanberraD; -TEST_P(DistanceCanberraD, Result) { +TEST_P(DistanceCanberraD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_chebyshev.cu b/cpp/test/distance/dist_chebyshev.cu index 6a2b02863a..0a4a69f059 100644 --- a/cpp/test/distance/dist_chebyshev.cu +++ b/cpp/test/distance/dist_chebyshev.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceLinf - : public DistanceTest {}; +class DistanceLinf : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceLinf DistanceLinfF; -TEST_P(DistanceLinfF, Result) { +TEST_P(DistanceLinfF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceLinf DistanceLinfD; -TEST_P(DistanceLinfD, Result) { +TEST_P(DistanceLinfD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_cos.cu b/cpp/test/distance/dist_cos.cu index 291c4196f9..f7510c17b1 100644 --- a/cpp/test/distance/dist_cos.cu +++ b/cpp/test/distance/dist_cos.cu @@ -21,9 +21,8 @@ namespace raft { namespace distance { template -class DistanceExpCos - : public DistanceTest {}; +class DistanceExpCos : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -36,14 +35,13 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceExpCos DistanceExpCosF; -TEST_P(DistanceExpCosF, Result) { +TEST_P(DistanceExpCosF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +54,13 @@ const std::vector> inputsd = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceExpCos DistanceExpCosD; -TEST_P(DistanceExpCosD, Result) { +TEST_P(DistanceExpCosD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_euc_exp.cu b/cpp/test/distance/dist_euc_exp.cu index 46e7ded0ec..e90d0e83dc 100644 --- a/cpp/test/distance/dist_euc_exp.cu +++ b/cpp/test/distance/dist_euc_exp.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceEucExpTest - : public DistanceTest {}; +class DistanceEucExpTest : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,13 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucExpTest DistanceEucExpTestF; -TEST_P(DistanceEucExpTestF, Result) { +TEST_P(DistanceEucExpTestF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +54,13 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucExpTest DistanceEucExpTestD; -TEST_P(DistanceEucExpTestD, Result) { +TEST_P(DistanceEucExpTestD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_euc_unexp.cu b/cpp/test/distance/dist_euc_unexp.cu index 92f424647d..90412a9cb2 100644 --- a/cpp/test/distance/dist_euc_unexp.cu +++ b/cpp/test/distance/dist_euc_unexp.cu @@ -36,14 +36,13 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucUnexpTest DistanceEucUnexpTestF; -TEST_P(DistanceEucUnexpTestF, Result) { +TEST_P(DistanceEucUnexpTestF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +55,13 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucUnexpTest DistanceEucUnexpTestD; -TEST_P(DistanceEucUnexpTestD, Result) { +TEST_P(DistanceEucUnexpTestD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_hellinger.cu b/cpp/test/distance/dist_hellinger.cu index 39dc7aaeff..95b1908dc1 100644 --- a/cpp/test/distance/dist_hellinger.cu +++ b/cpp/test/distance/dist_hellinger.cu @@ -22,8 +22,8 @@ namespace distance { template class DistanceHellingerExp - : public DistanceTest {}; + : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -36,14 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceHellingerExp DistanceHellingerExpF; -TEST_P(DistanceHellingerExpF, Result) { +TEST_P(DistanceHellingerExpF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceHellingerExp DistanceHellingerExpD; -TEST_P(DistanceHellingerExpD, Result) { +TEST_P(DistanceHellingerExpD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_l1.cu b/cpp/test/distance/dist_l1.cu index bd32837e45..d14f8d8a0b 100644 --- a/cpp/test/distance/dist_l1.cu +++ b/cpp/test/distance/dist_l1.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceUnexpL1 - : public DistanceTest {}; +class DistanceUnexpL1 : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceUnexpL1 DistanceUnexpL1F; -TEST_P(DistanceUnexpL1F, Result) { +TEST_P(DistanceUnexpL1F, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1F, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1F, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceUnexpL1 DistanceUnexpL1D; -TEST_P(DistanceUnexpL1D, Result) { +TEST_P(DistanceUnexpL1D, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1D, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1D, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_minkowski.cu b/cpp/test/distance/dist_minkowski.cu index 42b8e294ac..cc6a5f60de 100644 --- a/cpp/test/distance/dist_minkowski.cu +++ b/cpp/test/distance/dist_minkowski.cu @@ -21,8 +21,7 @@ namespace raft { namespace distance { template -class DistanceLpUnexp - : public DistanceTest { +class DistanceLpUnexp : public DistanceTest { }; const std::vector> inputsf = { @@ -36,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL, 3.0f}, }; typedef DistanceLpUnexp DistanceLpUnexpF; -TEST_P(DistanceLpUnexpF, Result) { +TEST_P(DistanceLpUnexpF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL, 4.0}, @@ -56,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL, 3.0}, }; typedef DistanceLpUnexp DistanceLpUnexpD; -TEST_P(DistanceLpUnexpD, Result) { +TEST_P(DistanceLpUnexpD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index fc7b064205..a99d307abb 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -25,43 +25,52 @@ namespace raft { namespace distance { template -__global__ void naiveDistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, int k, +__global__ void naiveDistanceKernel(DataType* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, raft::distance::DistanceType type, - bool isRowMajor) { + bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; auto diff = x[xidx] - y[yidx]; acc += diff * diff; } if (type == raft::distance::DistanceType::L2SqrtExpanded || type == raft::distance::DistanceType::L2SqrtUnexpanded) acc = raft::mySqrt(acc); - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -__global__ void naiveL1_Linf_CanberraDistanceKernel( - DataType *dist, const DataType *x, const DataType *y, int m, int n, int k, - raft::distance::DistanceType type, bool isRowMajor) { +__global__ void naiveL1_Linf_CanberraDistanceKernel(DataType* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + raft::distance::DistanceType type, + bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; - if (midx >= m || nidx >= n) { - return; - } + if (midx >= m || nidx >= n) { return; } DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; auto diff = (a > b) ? (a - b) : (b - a); if (type == raft::distance::DistanceType::Linf) { acc = raft::myMax(acc, diff); @@ -75,29 +84,27 @@ __global__ void naiveL1_Linf_CanberraDistanceKernel( } } - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -__global__ void naiveCosineDistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor) { +__global__ void naiveCosineDistanceKernel( + DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; - if (midx >= m || nidx >= n) { - return; - } + if (midx >= m || nidx >= n) { return; } - DataType acc_a = DataType(0); - DataType acc_b = DataType(0); + DataType acc_a = DataType(0); + DataType acc_b = DataType(0); DataType acc_ab = DataType(0); for (int i = 0; i < k; ++i) { int xidx = isRowMajor ? i + midx * k : i * m + midx; int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + auto a = x[xidx]; + auto b = y[yidx]; acc_a += a * a; acc_b += b * b; acc_ab += a * b; @@ -106,64 +113,74 @@ __global__ void naiveCosineDistanceKernel(DataType *dist, const DataType *x, int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; // Use 1.0 - (cosine similarity) to calc the distance - dist[outidx] = - (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b)); + dist[outidx] = (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b)); } template -__global__ void naiveHellingerDistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor) { +__global__ void naiveHellingerDistanceKernel( + DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; - if (midx >= m || nidx >= n) { - return; - } + if (midx >= m || nidx >= n) { return; } DataType acc_ab = DataType(0); for (int i = 0; i < k; ++i) { int xidx = isRowMajor ? i + midx * k : i * m + midx; int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + auto a = x[xidx]; + auto b = y[yidx]; acc_ab += raft::mySqrt(a) * raft::mySqrt(b); } int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative - acc_ab = 1 - acc_ab; + acc_ab = 1 - acc_ab; auto rectifier = (!signbit(acc_ab)); - dist[outidx] = raft::mySqrt(rectifier * acc_ab); + dist[outidx] = raft::mySqrt(rectifier * acc_ab); } template -__global__ void naiveLpUnexpDistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor, DataType p) { +__global__ void naiveLpUnexpDistanceKernel(DataType* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + bool isRowMajor, + DataType p) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; auto diff = raft::L1Op()(a - b); acc += raft::myPow(diff, p); } auto one_over_p = 1 / p; - acc = raft::myPow(acc, one_over_p); - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; - dist[outidx] = acc; + acc = raft::myPow(acc, one_over_p); + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + dist[outidx] = acc; } template -void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m, - int n, int k, raft::distance::DistanceType type, - bool isRowMajor, DataType metric_arg = 2.0f) { +void naiveDistance(DataType* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + raft::distance::DistanceType type, + bool isRowMajor, + DataType metric_arg = 2.0f) +{ static const dim3 TPB(16, 32, 1); dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1); @@ -178,23 +195,19 @@ void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m, case raft::distance::DistanceType::L2Unexpanded: case raft::distance::DistanceType::L2SqrtExpanded: case raft::distance::DistanceType::L2Expanded: - naiveDistanceKernel - <<>>(dist, x, y, m, n, k, type, isRowMajor); + naiveDistanceKernel<<>>(dist, x, y, m, n, k, type, isRowMajor); break; case raft::distance::DistanceType::CosineExpanded: - naiveCosineDistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveCosineDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); break; case raft::distance::DistanceType::HellingerExpanded: - naiveHellingerDistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveHellingerDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); break; case raft::distance::DistanceType::LpUnexpanded: naiveLpUnexpDistanceKernel <<>>(dist, x, y, m, n, k, isRowMajor, metric_arg); break; - default: - FAIL() << "should be here\n"; + default: FAIL() << "should be here\n"; } CUDA_CHECK(cudaPeekAtLastError()); } @@ -209,37 +222,47 @@ struct DistanceInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const DistanceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const DistanceInputs& dims) +{ return os; } template -void distanceLauncher(DataType *x, DataType *y, DataType *dist, DataType *dist2, - int m, int n, int k, DistanceInputs ¶ms, - DataType threshold, char *workspace, size_t worksize, - cudaStream_t stream, bool isRowMajor, - DataType metric_arg = 2.0f) { +void distanceLauncher(DataType* x, + DataType* y, + DataType* dist, + DataType* dist2, + int m, + int n, + int k, + DistanceInputs& params, + DataType threshold, + char* workspace, + size_t worksize, + cudaStream_t stream, + bool isRowMajor, + DataType metric_arg = 2.0f) +{ auto fin_op = [dist2, threshold] __device__(DataType d_val, int g_d_idx) { dist2[g_d_idx] = (d_val < threshold) ? 0.f : d_val; return d_val; }; raft::distance::distance( - x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, - metric_arg); + x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg); } template class DistanceTest : public ::testing::TestWithParam> { public: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - int m = params.m; - int n = params.n; - int k = params.k; + int m = params.m; + int n = params.n; + int k = params.k; DataType metric_arg = params.metric_arg; - bool isRowMajor = params.isRowMajor; + bool isRowMajor = params.isRowMajor; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); raft::allocate(x, m * k); @@ -256,25 +279,33 @@ class DistanceTest : public ::testing::TestWithParam> { r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream); } - naiveDistance(dist_ref, x, y, m, n, k, distanceType, isRowMajor, - metric_arg); - char *workspace = nullptr; + naiveDistance(dist_ref, x, y, m, n, k, distanceType, isRowMajor, metric_arg); + char* workspace = nullptr; size_t worksize = - raft::distance::getWorkspaceSize(x, y, m, n, k); - if (worksize != 0) { - raft::allocate(workspace, worksize); - } + raft::distance::getWorkspaceSize(x, y, m, n, k); + if (worksize != 0) { raft::allocate(workspace, worksize); } DataType threshold = -10000.f; - distanceLauncher(x, y, dist, dist2, m, n, k, params, - threshold, workspace, worksize, - stream, isRowMajor, metric_arg); + distanceLauncher(x, + y, + dist, + dist2, + m, + n, + k, + params, + threshold, + workspace, + worksize, + stream, + isRowMajor, + metric_arg); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(workspace)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(x)); CUDA_CHECK(cudaFree(y)); CUDA_CHECK(cudaFree(dist_ref)); diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index 4573a070b6..a7b763a2bc 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -29,40 +29,40 @@ template struct CubKVPMinReduce { typedef cub::KeyValuePair KVP; - DI KVP operator()(LabelT rit, const KVP &a, const KVP &b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } - DI KVP operator()(const KVP &a, const KVP &b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } }; // KVPMinReduce template -__global__ void naiveKernel(cub::KeyValuePair *min, DataT *x, - DataT *y, int m, int n, int k, int *workspace, - DataT maxVal) { - int midx = threadIdx.y + blockIdx.y * blockDim.y; - int nidx = threadIdx.x + blockIdx.x * blockDim.x; +__global__ void naiveKernel(cub::KeyValuePair* min, + DataT* x, + DataT* y, + int m, + int n, + int k, + int* workspace, + DataT maxVal) +{ + int midx = threadIdx.y + blockIdx.y * blockDim.y; + int nidx = threadIdx.x + blockIdx.x * blockDim.x; DataT acc = DataT(0); for (int i = 0; i < k; ++i) { - int xidx = i + midx * k; - int yidx = i + nidx * k; + int xidx = i + midx * k; + int yidx = i + nidx * k; auto diff = midx >= m || nidx >= n ? DataT(0) : x[xidx] - y[yidx]; acc += diff * diff; } - if (Sqrt) { - acc = raft::mySqrt(acc); - } + if (Sqrt) { acc = raft::mySqrt(acc); } ReduceOpT redOp; typedef cub::WarpReduce> WarpReduce; __shared__ typename WarpReduce::TempStorage temp[NWARPS]; int warpId = threadIdx.x / raft::WarpSize; cub::KeyValuePair tmp; - tmp.key = nidx; + tmp.key = nidx; tmp.value = midx >= m || nidx >= n ? maxVal : acc; - tmp = WarpReduce(temp[warpId]).Reduce(tmp, CubKVPMinReduce()); + tmp = WarpReduce(temp[warpId]).Reduce(tmp, CubKVPMinReduce()); if (threadIdx.x % raft::WarpSize == 0 && midx < m) { while (atomicCAS(workspace + midx, 0, 1) == 1) ; @@ -74,8 +74,15 @@ __global__ void naiveKernel(cub::KeyValuePair *min, DataT *x, } template -void naive(cub::KeyValuePair *min, DataT *x, DataT *y, int m, int n, - int k, int *workspace, cudaStream_t stream) { +void naive(cub::KeyValuePair* min, + DataT* x, + DataT* y, + int m, + int n, + int k, + int* workspace, + cudaStream_t stream) +{ static const dim3 TPB(32, 16, 1); dim3 nblks(raft::ceildiv(n, (int)TPB.x), raft::ceildiv(m, (int)TPB.y), 1); CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream)); @@ -85,8 +92,7 @@ void naive(cub::KeyValuePair *min, DataT *x, DataT *y, int m, int n, <<>>(min, m, std::numeric_limits::max(), op); CUDA_CHECK(cudaGetLastError()); naiveKernel, 16> - <<>>(min, x, y, m, n, k, workspace, - std::numeric_limits::max()); + <<>>(min, x, y, m, n, k, workspace, std::numeric_limits::max()); CUDA_CHECK(cudaGetLastError()); } @@ -100,7 +106,8 @@ struct Inputs { template class FusedL2NNTest : public ::testing::TestWithParam> { public: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int m = params.m; @@ -121,7 +128,8 @@ class FusedL2NNTest : public ::testing::TestWithParam> { raft::linalg::rowNorm(yn, y, k, n, raft::linalg::L2Norm, true, stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(x)); @@ -136,25 +144,38 @@ class FusedL2NNTest : public ::testing::TestWithParam> { protected: Inputs params; DataT *x, *y, *xn, *yn; - char *workspace; - cub::KeyValuePair *min, *min_ref; + char* workspace; + cub::KeyValuePair*min, *min_ref; cudaStream_t stream; - virtual void generateGoldenResult() { + virtual void generateGoldenResult() + { int m = params.m; int n = params.n; int k = params.k; - naive(min_ref, x, y, m, n, k, (int *)workspace, stream); + naive(min_ref, x, y, m, n, k, (int*)workspace, stream); } - void runTest(cub::KeyValuePair *out) { + void runTest(cub::KeyValuePair* out) + { int m = params.m; int n = params.n; int k = params.k; MinAndDistanceReduceOp redOp; - fusedL2NN, int>( - out, x, y, xn, yn, m, n, k, (void *)workspace, redOp, - raft::distance::KVPMinReduce(), Sqrt, true, stream); + fusedL2NN, int>(out, + x, + y, + xn, + yn, + m, + n, + k, + (void*)workspace, + redOp, + raft::distance::KVPMinReduce(), + Sqrt, + true, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } }; @@ -163,9 +184,10 @@ template struct CompareApproxAbsKVP { typedef typename cub::KeyValuePair KVP; CompareApproxAbsKVP(T eps_) : eps(eps_) {} - bool operator()(const KVP &a, const KVP &b) const { - T diff = raft::abs(raft::abs(a.value) - raft::abs(b.value)); - T m = std::max(raft::abs(a.value), raft::abs(b.value)); + bool operator()(const KVP& a, const KVP& b) const + { + T diff = raft::abs(raft::abs(a.value) - raft::abs(b.value)); + T m = std::max(raft::abs(a.value), raft::abs(b.value)); T ratio = m >= eps ? diff / m : diff; return (ratio <= eps); } @@ -177,17 +199,20 @@ struct CompareApproxAbsKVP { template struct CompareExactKVP { typedef typename cub::KeyValuePair KVP; - bool operator()(const KVP &a, const KVP &b) const { + bool operator()(const KVP& a, const KVP& b) const + { if (a.value != b.value) return false; return true; } }; template -::testing::AssertionResult devArrMatch(const cub::KeyValuePair *expected, - const cub::KeyValuePair *actual, - size_t size, L eq_compare, - cudaStream_t stream = 0) { +::testing::AssertionResult devArrMatch(const cub::KeyValuePair* expected, + const cub::KeyValuePair* actual, + size_t size, + L eq_compare, + cudaStream_t stream = 0) +{ typedef typename cub::KeyValuePair KVP; std::shared_ptr exp_h(new KVP[size]); std::shared_ptr act_h(new KVP[size]); @@ -199,47 +224,42 @@ template auto act = act_h.get()[i]; if (!eq_compare(exp, act)) { return ::testing::AssertionFailure() - << "actual=" << act.key << "," << act.value - << " != expected=" << exp.key << "," << exp.value << " @" << i; + << "actual=" << act.key << "," << act.value << " != expected=" << exp.key << "," + << exp.value << " @" << i; } } return ::testing::AssertionSuccess(); } const std::vector> inputsf = { - {0.001f, 32, 32, 32, 1234ULL}, {0.001f, 32, 64, 32, 1234ULL}, - {0.001f, 64, 32, 32, 1234ULL}, {0.001f, 64, 64, 32, 1234ULL}, - {0.001f, 128, 32, 32, 1234ULL}, {0.001f, 128, 64, 32, 1234ULL}, + {0.001f, 32, 32, 32, 1234ULL}, {0.001f, 32, 64, 32, 1234ULL}, {0.001f, 64, 32, 32, 1234ULL}, + {0.001f, 64, 64, 32, 1234ULL}, {0.001f, 128, 32, 32, 1234ULL}, {0.001f, 128, 64, 32, 1234ULL}, {0.001f, 128, 128, 64, 1234ULL}, {0.001f, 64, 128, 128, 1234ULL}, - {0.001f, 32, 32, 34, 1234ULL}, {0.001f, 32, 64, 34, 1234ULL}, - {0.001f, 64, 32, 34, 1234ULL}, {0.001f, 64, 64, 34, 1234ULL}, - {0.001f, 128, 32, 34, 1234ULL}, {0.001f, 128, 64, 34, 1234ULL}, + {0.001f, 32, 32, 34, 1234ULL}, {0.001f, 32, 64, 34, 1234ULL}, {0.001f, 64, 32, 34, 1234ULL}, + {0.001f, 64, 64, 34, 1234ULL}, {0.001f, 128, 32, 34, 1234ULL}, {0.001f, 128, 64, 34, 1234ULL}, {0.001f, 128, 128, 66, 1234ULL}, {0.001f, 64, 128, 130, 1234ULL}, - {0.001f, 32, 32, 33, 1234ULL}, {0.001f, 32, 64, 33, 1234ULL}, - {0.001f, 64, 32, 33, 1234ULL}, {0.001f, 64, 64, 33, 1234ULL}, - {0.001f, 128, 32, 33, 1234ULL}, {0.001f, 128, 64, 33, 1234ULL}, + {0.001f, 32, 32, 33, 1234ULL}, {0.001f, 32, 64, 33, 1234ULL}, {0.001f, 64, 32, 33, 1234ULL}, + {0.001f, 64, 64, 33, 1234ULL}, {0.001f, 128, 32, 33, 1234ULL}, {0.001f, 128, 64, 33, 1234ULL}, {0.001f, 128, 128, 65, 1234ULL}, {0.001f, 64, 128, 129, 1234ULL}, {0.006f, 1805, 134, 2, 1234ULL}, }; typedef FusedL2NNTest FusedL2NNTestF_Sq; -TEST_P(FusedL2NNTestF_Sq, Result) { +TEST_P(FusedL2NNTestF_Sq, Result) +{ runTest(min); - ASSERT_TRUE(devArrMatch(min_ref, min, params.m, - CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sq, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sq, ::testing::ValuesIn(inputsf)); typedef FusedL2NNTest FusedL2NNTestF_Sqrt; -TEST_P(FusedL2NNTestF_Sqrt, Result) { +TEST_P(FusedL2NNTestF_Sqrt, Result) +{ runTest(min); - ASSERT_TRUE(devArrMatch(min_ref, min, params.m, - CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sqrt, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sqrt, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.00001, 32, 32, 32, 1234ULL}, {0.00001, 32, 64, 32, 1234ULL}, @@ -260,38 +280,38 @@ const std::vector> inputsd = { {0.00001, 1805, 134, 2, 1234ULL}, }; typedef FusedL2NNTest FusedL2NNTestD_Sq; -TEST_P(FusedL2NNTestD_Sq, Result) { +TEST_P(FusedL2NNTestD_Sq, Result) +{ runTest(min); - ASSERT_TRUE(devArrMatch(min_ref, min, params.m, - CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sq, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sq, ::testing::ValuesIn(inputsd)); typedef FusedL2NNTest FusedL2NNTestD_Sqrt; -TEST_P(FusedL2NNTestD_Sqrt, Result) { +TEST_P(FusedL2NNTestD_Sqrt, Result) +{ runTest(min); - ASSERT_TRUE(devArrMatch(min_ref, min, params.m, - CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sqrt, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sqrt, ::testing::ValuesIn(inputsd)); /// This is to test output determinism of the prim template class FusedL2NNDetTest : public FusedL2NNTest { - void SetUp() override { + void SetUp() override + { FusedL2NNTest::SetUp(); int m = this->params.m; raft::allocate(min1, m); } - void TearDown() override { + void TearDown() override + { FusedL2NNTest::TearDown(); CUDA_CHECK(cudaFree(min1)); } protected: - cub::KeyValuePair *min1; + cub::KeyValuePair* min1; static const int NumRepeats = 100; @@ -299,46 +319,46 @@ class FusedL2NNDetTest : public FusedL2NNTest { }; typedef FusedL2NNDetTest FusedL2NNDetTestF_Sq; -TEST_P(FusedL2NNDetTestF_Sq, Result) { +TEST_P(FusedL2NNDetTestF_Sq, Result) +{ runTest(min); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1); ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sq, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sq, ::testing::ValuesIn(inputsf)); typedef FusedL2NNDetTest FusedL2NNDetTestF_Sqrt; -TEST_P(FusedL2NNDetTestF_Sqrt, Result) { +TEST_P(FusedL2NNDetTestF_Sqrt, Result) +{ runTest(min); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1); ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sqrt, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sqrt, ::testing::ValuesIn(inputsf)); typedef FusedL2NNDetTest FusedL2NNDetTestD_Sq; -TEST_P(FusedL2NNDetTestD_Sq, Result) { +TEST_P(FusedL2NNDetTestD_Sq, Result) +{ runTest(min); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1); ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sq, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sq, ::testing::ValuesIn(inputsd)); typedef FusedL2NNDetTest FusedL2NNDetTestD_Sqrt; -TEST_P(FusedL2NNDetTestD_Sqrt, Result) { +TEST_P(FusedL2NNDetTestD_Sqrt, Result) +{ runTest(min); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1); ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sqrt, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sqrt, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index e6ee09262e..e14841eb54 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -23,7 +23,8 @@ namespace raft { -TEST(Raft, EigenSolvers) { +TEST(Raft, EigenSolvers) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -35,10 +36,10 @@ TEST(Raft, EigenSolvers) { index_type* ro{nullptr}; index_type* ci{nullptr}; value_type* vs{nullptr}; - index_type nnz = 0; + index_type nnz = 0; index_type nrows = 0; - auto stream = h.get_stream(); - auto t_exe_pol = thrust::cuda::par.on(stream); + auto stream = h.get_stream(); + auto t_exe_pol = thrust::cuda::par.on(stream); sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; ASSERT_EQ(nullptr, sm1.row_offsets_); @@ -49,7 +50,7 @@ TEST(Raft, EigenSolvers) { value_type tol{1.0e-10}; bool reorthog{true}; - //nullptr expected to trigger exceptions: + // nullptr expected to trigger exceptions: // value_type* eigvals{nullptr}; value_type* eigvecs{nullptr}; @@ -60,14 +61,13 @@ TEST(Raft, EigenSolvers) { lanczos_solver_t eig_solver{cfg}; - EXPECT_ANY_THROW( - eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs)); + EXPECT_ANY_THROW(eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs)); - EXPECT_ANY_THROW( - eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs)); + EXPECT_ANY_THROW(eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs)); } -TEST(Raft, SpectralSolvers) { +TEST(Raft, SpectralSolvers) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -82,7 +82,7 @@ TEST(Raft, SpectralSolvers) { value_type tol{1.0e-10}; bool reorthog{true}; - //nullptr expected to trigger exceptions: + // nullptr expected to trigger exceptions: // index_type* clusters{nullptr}; value_type* eigvals{nullptr}; @@ -96,22 +96,19 @@ TEST(Raft, SpectralSolvers) { index_type k{5}; - cluster_solver_config_t clust_cfg{k, maxiter, tol, - seed}; + cluster_solver_config_t clust_cfg{k, maxiter, tol, seed}; kmeans_solver_t cluster_solver{clust_cfg}; auto stream = h.get_stream(); auto t_exe_p = thrust::cuda::par.on(stream); - sparse_matrix_t sm{h, nullptr, nullptr, - nullptr, 0, 0}; - EXPECT_ANY_THROW(spectral::partition( - h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; + EXPECT_ANY_THROW( + spectral::partition(h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); value_type edgeCut{0}; value_type cost{0}; - EXPECT_ANY_THROW( - spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost)); + EXPECT_ANY_THROW(spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost)); } } // namespace raft diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp index 4cb9809844..8023fca319 100644 --- a/cpp/test/handle.cpp +++ b/cpp/test/handle.cpp @@ -22,7 +22,8 @@ namespace raft { -TEST(Raft, HandleDefault) { +TEST(Raft, HandleDefault) +{ handle_t h; ASSERT_EQ(0, h.get_num_internal_streams()); ASSERT_EQ(0, h.get_device()); @@ -33,7 +34,8 @@ TEST(Raft, HandleDefault) { ASSERT_NE(nullptr, h.get_cusparse_handle()); } -TEST(Raft, Handle) { +TEST(Raft, Handle) +{ handle_t h(4); ASSERT_EQ(4, h.get_num_internal_streams()); cudaStream_t stream; @@ -44,13 +46,15 @@ TEST(Raft, Handle) { CUDA_CHECK(cudaStreamDestroy(stream)); } -TEST(Raft, GetInternalStreams) { +TEST(Raft, GetInternalStreams) +{ handle_t h(4); auto streams = h.get_internal_streams(); ASSERT_EQ(4U, streams.size()); } -TEST(Raft, GetHandleFromPool) { +TEST(Raft, GetHandleFromPool) +{ handle_t parent(4); handle_t child(parent, 2); @@ -64,7 +68,8 @@ TEST(Raft, GetHandleFromPool) { ASSERT_EQ(parent.get_device(), child.get_device()); } -TEST(Raft, GetHandleFromPoolPerf) { +TEST(Raft, GetHandleFromPoolPerf) +{ handle_t parent(100); auto start = curTimeMillis(); for (int i = 0; i < parent.get_num_internal_streams(); i++) { @@ -76,13 +81,13 @@ TEST(Raft, GetHandleFromPoolPerf) { ASSERT_LE(curTimeMillis() - start, 10); } -TEST(Raft, GetHandleStreamViews) { +TEST(Raft, GetHandleStreamViews) +{ handle_t parent(4); handle_t child(parent, 2); ASSERT_EQ(parent.get_internal_stream_view(2), child.get_stream_view()); - ASSERT_EQ(parent.get_internal_stream_view(2).value(), - child.get_stream_view().value()); + ASSERT_EQ(parent.get_internal_stream_view(2).value(), child.get_stream_view().value()); EXPECT_FALSE(child.get_stream_view().is_default()); } } // namespace raft diff --git a/cpp/test/integer_utils.cpp b/cpp/test/integer_utils.cpp index 830d085a40..d883de59fe 100644 --- a/cpp/test/integer_utils.cpp +++ b/cpp/test/integer_utils.cpp @@ -20,7 +20,8 @@ namespace raft { -TEST(Raft, rounding_up) { +TEST(Raft, rounding_up) +{ ASSERT_EQ(raft::div_rounding_up_safe(5, 3), 2); ASSERT_EQ(raft::div_rounding_up_safe(0, 3), 0); ASSERT_EQ(raft::div_rounding_up_safe(7, 8), 1); @@ -29,7 +30,8 @@ TEST(Raft, rounding_up) { ASSERT_EQ(raft::div_rounding_up_unsafe(7, 8), 1); } -TEST(Raft, is_a_power_of_two) { +TEST(Raft, is_a_power_of_two) +{ ASSERT_EQ(raft::is_a_power_of_two(1 << 5), true); ASSERT_EQ(raft::is_a_power_of_two((1 << 5) + 1), false); } diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu index dc2846fdba..209bb0355a 100644 --- a/cpp/test/label/label.cu +++ b/cpp/test/label/label.cu @@ -36,7 +36,8 @@ class labelTest : public ::testing::Test { }; typedef labelTest MakeMonotonicTest; -TEST_F(MakeMonotonicTest, Result) { +TEST_F(MakeMonotonicTest, Result) +{ cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -48,17 +49,14 @@ TEST_F(MakeMonotonicTest, Result) { raft::allocate(actual, m, true); raft::allocate(expected, m, true); - float *data_h = - new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0}; + float* data_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0}; - float *expected_h = - new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 6.0, 7.0}; + float* expected_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 6.0, 7.0}; raft::update_device(data, data_h, m, stream); raft::update_device(expected, expected_h, m, stream); - std::shared_ptr allocator( - new raft::mr::device::default_allocator); + std::shared_ptr allocator(new raft::mr::device::default_allocator); make_monotonic(actual, data, m, stream, allocator); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -73,37 +71,36 @@ TEST_F(MakeMonotonicTest, Result) { delete expected_h; } -TEST(labelTest, Classlabels) { +TEST(labelTest, Classlabels) +{ cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - std::shared_ptr allocator( - new raft::mr::device::default_allocator); + std::shared_ptr allocator(new raft::mr::device::default_allocator); int n_rows = 6; - float *y_d; + float* y_d; raft::allocate(y_d, n_rows); float y_h[] = {2, -1, 1, 2, 1, 1}; raft::update_device(y_d, y_h, n_rows, stream); int n_classes; - float *y_unique_d; + float* y_unique_d; getUniquelabels(y_d, n_rows, &y_unique_d, &n_classes, stream, allocator); ASSERT_EQ(n_classes, 3); float y_unique_exp[] = {-1, 1, 2}; - EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d, n_classes, - raft::Compare(), stream)); + EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d, n_classes, raft::Compare(), stream)); - float *y_relabeled_d; + float* y_relabeled_d; raft::allocate(y_relabeled_d, n_rows); getOvrlabels(y_d, n_rows, y_unique_d, n_classes, y_relabeled_d, 2, stream); float y_relabeled_exp[] = {1, -1, -1, 1, -1, -1}; - EXPECT_TRUE(devArrMatchHost(y_relabeled_exp, y_relabeled_d, n_rows, - raft::Compare(), stream)); + EXPECT_TRUE( + devArrMatchHost(y_relabeled_exp, y_relabeled_d, n_rows, raft::Compare(), stream)); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(y_d)); diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu index a2f14a8dbc..3d930ff22e 100644 --- a/cpp/test/label/merge_labels.cu +++ b/cpp/test/label/merge_labels.cu @@ -39,8 +39,7 @@ struct MergeLabelsInputs { }; template -class MergeLabelsTest - : public ::testing::TestWithParam> { +class MergeLabelsTest : public ::testing::TestWithParam> { protected: MergeLabelsTest() : params(::testing::TestWithParam>::GetParam()), @@ -50,25 +49,23 @@ class MergeLabelsTest expected(params.N, stream), R(params.N, stream), mask(params.N, stream), - m(1, stream) {} - - void Run() { - raft::update_device(labels_a.data(), params.labels_a.data(), params.N, - stream); - raft::update_device(labels_b.data(), params.labels_b.data(), params.N, - stream); - raft::update_device(expected.data(), params.expected.data(), params.N, - stream); - raft::update_device(mask.data(), - reinterpret_cast(params.mask.data()), params.N, - stream); - - merge_labels(labels_a.data(), labels_b.data(), mask.data(), R.data(), - m.data(), params.N, stream); + m(1, stream) + { + } + + void Run() + { + raft::update_device(labels_a.data(), params.labels_a.data(), params.N, stream); + raft::update_device(labels_b.data(), params.labels_b.data(), params.N, stream); + raft::update_device(expected.data(), params.expected.data(), params.N, stream); + raft::update_device(mask.data(), reinterpret_cast(params.mask.data()), params.N, stream); + + merge_labels( + labels_a.data(), labels_b.data(), mask.data(), R.data(), m.data(), params.N, stream); cudaStreamSynchronize(stream); - ASSERT_TRUE(raft::devArrMatch(expected.data(), labels_a.data(), - params.N, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch( + expected.data(), labels_a.data(), params.N, raft::Compare())); } protected: @@ -85,22 +82,14 @@ TEST_P(MergeLabelsTestI, Result) { Run(); } using MergeLabelsTestL = MergeLabelsTest; TEST_P(MergeLabelsTestL, Result) { Run(); } -constexpr int MAX32 = std::numeric_limits::max(); +constexpr int MAX32 = std::numeric_limits::max(); constexpr int64_t MAX64 = std::numeric_limits::max(); const std::vector> merge_inputs_32 = { {4, {1, 1, 3, MAX32}, {1, 3, 3, 1}, {1, 0, 1, 0}, {1, 1, 3, 1}}, {5, {1, 2, 2, 2, 1}, {4, 2, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, - {6, - {1, 2, 1, 4, 5, MAX32}, - {1, 2, MAX32, 4, 5, 4}, - {1, 1, 0, 1, 1, 0}, - {1, 2, 1, 4, 5, 4}}, - {6, - {1, 2, 2, 2, 2, 6}, - {1, 1, 1, 5, 5, 5}, - {1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1}}, + {6, {1, 2, 1, 4, 5, MAX32}, {1, 2, MAX32, 4, 5, 4}, {1, 1, 0, 1, 1, 0}, {1, 2, 1, 4, 5, 4}}, + {6, {1, 2, 2, 2, 2, 6}, {1, 1, 1, 5, 5, 5}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}}, {8, {1, 1, 3, 3, MAX32, 1, 3, MAX32}, {1, 2, 3, 2, MAX32, 2, 2, 2}, @@ -116,16 +105,8 @@ const std::vector> merge_inputs_32 = { const std::vector> merge_inputs_64 = { {4, {1, 1, 3, MAX64}, {1, 3, 3, 1}, {1, 0, 1, 0}, {1, 1, 3, 1}}, {5, {1, 2, 2, 2, 1}, {4, 2, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, - {6, - {1, 2, 1, 4, 5, MAX64}, - {1, 2, MAX64, 4, 5, 4}, - {1, 1, 0, 1, 1, 0}, - {1, 2, 1, 4, 5, 4}}, - {6, - {1, 2, 2, 2, 2, 6}, - {1, 1, 1, 5, 5, 5}, - {1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1}}, + {6, {1, 2, 1, 4, 5, MAX64}, {1, 2, MAX64, 4, 5, 4}, {1, 1, 0, 1, 1, 0}, {1, 2, 1, 4, 5, 4}}, + {6, {1, 2, 2, 2, 2, 6}, {1, 1, 1, 5, 5, 5}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}}, {8, {1, 1, 3, 3, MAX64, 1, 3, MAX64}, {1, 2, 3, 2, MAX64, 2, 2, 2}, @@ -138,10 +119,8 @@ const std::vector> merge_inputs_64 = { {1, 1, 1, 1, 1, 7, 7, 7}}, }; -INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestI, - ::testing::ValuesIn(merge_inputs_32)); -INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestL, - ::testing::ValuesIn(merge_inputs_64)); +INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestI, ::testing::ValuesIn(merge_inputs_32)); +INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestL, ::testing::ValuesIn(merge_inputs_64)); } // namespace label } // namespace raft diff --git a/cpp/test/lap/lap.cu b/cpp/test/lap/lap.cu index 04f473f836..61c7182c72 100644 --- a/cpp/test/lap/lap.cu +++ b/cpp/test/lap/lap.cu @@ -29,11 +29,11 @@ #include #include -#define PROBLEMSIZE 1000 // Number of rows/columns -#define BATCHSIZE 10 // Number of problems in the batch -#define COSTRANGE 1000 +#define PROBLEMSIZE 1000 // Number of rows/columns +#define BATCHSIZE 10 // Number of problems in the batch +#define COSTRANGE 1000 #define PROBLEMCOUNT 1 -#define REPETITIONS 1 +#define REPETITIONS 1 #define SEED 01010001 @@ -43,41 +43,43 @@ namespace raft { // Function for generating problem with uniformly distributed integer costs between [0, COSTRANGE]. template -void generateProblem(weight_t *cost_matrix, int SP, int N, int costrange) { +void generateProblem(weight_t* cost_matrix, int SP, int N, int costrange) +{ long N2 = SP * N * N; std::uniform_int_distribution distribution(0, costrange); for (long i = 0; i < N2; i++) { - int val = distribution(generator); + int val = distribution(generator); cost_matrix[i] = (weight_t)val; } } template -void hungarian_test(int problemsize, int costrange, int problemcount, - int repetitions, int batchsize, weight_t epsilon, - bool verbose = false) { +void hungarian_test(int problemsize, + int costrange, + int problemcount, + int repetitions, + int batchsize, + weight_t epsilon, + bool verbose = false) +{ raft::handle_t handle; - weight_t *h_cost = new weight_t[batchsize * problemsize * problemsize]; + weight_t* h_cost = new weight_t[batchsize * problemsize * problemsize]; for (int j = 0; j < problemcount; j++) { generateProblem(h_cost, batchsize, problemsize, costrange); raft::mr::device::buffer elements_v( - handle.get_device_allocator(), handle.get_stream(), - batchsize * problemsize * problemsize); + handle.get_device_allocator(), handle.get_stream(), batchsize * problemsize * problemsize); raft::mr::device::buffer row_assignment_v( - handle.get_device_allocator(), handle.get_stream(), - batchsize * problemsize); + handle.get_device_allocator(), handle.get_stream(), batchsize * problemsize); raft::mr::device::buffer col_assignment_v( - handle.get_device_allocator(), handle.get_stream(), - batchsize * problemsize); + handle.get_device_allocator(), handle.get_stream(), batchsize * problemsize); - raft::update_device(elements_v.data(), h_cost, - batchsize * problemsize * problemsize, - handle.get_stream()); + raft::update_device( + elements_v.data(), h_cost, batchsize * problemsize * problemsize, handle.get_stream()); for (int i = 0; i < repetitions; i++) { float start = omp_get_wtime(); @@ -87,20 +89,18 @@ void hungarian_test(int problemsize, int costrange, int problemcount, handle, problemsize, batchsize, epsilon); // Solve LAP(s) for given cost matrix - lpx.solve(elements_v.data(), row_assignment_v.data(), - col_assignment_v.data()); + lpx.solve(elements_v.data(), row_assignment_v.data(), col_assignment_v.data()); float end = omp_get_wtime(); float total_time = (end - start); if (verbose) { - // Use getPrimalObjectiveValue and getDualObjectiveValue APIs to get primal and dual objectives. At optimality both values should match. + // Use getPrimalObjectiveValue and getDualObjectiveValue APIs to get primal and dual + // objectives. At optimality both values should match. for (int k = 0; k < batchsize; k++) { - std::cout << j << ":" << i << ":" << k << ":" - << lpx.getPrimalObjectiveValue(k) << ":" - << lpx.getDualObjectiveValue(k) << ":" << total_time - << std::endl; + std::cout << j << ":" << i << ":" << k << ":" << lpx.getPrimalObjectiveValue(k) << ":" + << lpx.getDualObjectiveValue(k) << ":" << total_time << std::endl; } } } @@ -109,34 +109,38 @@ void hungarian_test(int problemsize, int costrange, int problemcount, delete[] h_cost; } -TEST(Raft, HungarianIntFloat) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, float{1e-6}); +TEST(Raft, HungarianIntFloat) +{ + hungarian_test( + PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, float{1e-6}); } -TEST(Raft, HungarianIntDouble) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, double{1e-6}); +TEST(Raft, HungarianIntDouble) +{ + hungarian_test( + PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, double{1e-6}); } -TEST(Raft, HungarianIntLong) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, long{0}); +TEST(Raft, HungarianIntLong) +{ + hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, long{0}); } -TEST(Raft, HungarianLongFloat) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, float{1e-6}); +TEST(Raft, HungarianLongFloat) +{ + hungarian_test( + PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, float{1e-6}); } -TEST(Raft, HungarianLongDouble) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, - REPETITIONS, BATCHSIZE, double{1e-6}); +TEST(Raft, HungarianLongDouble) +{ + hungarian_test( + PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, double{1e-6}); } -TEST(Raft, HungarianLongLong) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, long{0}); +TEST(Raft, HungarianLongLong) +{ + hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, long{0}); } } // namespace raft diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu index 2fc9d4e30f..38e189f27e 100644 --- a/cpp/test/linalg/add.cu +++ b/cpp/test/linalg/add.cu @@ -27,7 +27,8 @@ namespace linalg { template class AddTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; @@ -42,7 +43,8 @@ class AddTest : public ::testing::TestWithParam> { add(out, in1, in2, len, stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(in1)); CUDA_CHECK(cudaFree(in2)); @@ -51,9 +53,10 @@ class AddTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void compare() { - ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len, - raft::CompareApprox(params.tolerance))); + void compare() + { + ASSERT_TRUE( + raft::devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); } protected: diff --git a/cpp/test/linalg/add.cuh b/cpp/test/linalg/add.cuh index 137419758f..1d9352bfc1 100644 --- a/cpp/test/linalg/add.cuh +++ b/cpp/test/linalg/add.cuh @@ -23,18 +23,17 @@ namespace raft { namespace linalg { template -__global__ void naiveAddElemKernel(OutT *out, const InT *in1, const InT *in2, - int len) { +__global__ void naiveAddElemKernel(OutT* out, const InT* in1, const InT* in2, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = OutT(in1[idx] + in2[idx]); - } + if (idx < len) { out[idx] = OutT(in1[idx] + in2[idx]); } } template -void naiveAddElem(OutT *out, const InT *in1, const InT *in2, int len) { +void naiveAddElem(OutT* out, const InT* in1, const InT* in2, int len) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveAddElemKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -47,8 +46,8 @@ struct AddInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const AddInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const AddInputs& dims) +{ return os; } diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu index 3ae4f86066..078c41356a 100644 --- a/cpp/test/linalg/binary_op.cu +++ b/cpp/test/linalg/binary_op.cu @@ -29,20 +29,19 @@ namespace linalg { // for an extended __device__ lambda cannot have private or protected access // within its class template -void binaryOpLaunch(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void binaryOpLaunch( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, stream); } template -class BinaryOpTest - : public ::testing::TestWithParam> { +class BinaryOpTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = ::testing::TestWithParam< - BinaryOpInputs>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); cudaStream_t stream; @@ -59,7 +58,8 @@ class BinaryOpTest CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in1)); CUDA_CHECK(cudaFree(in2)); CUDA_CHECK(cudaFree(out_ref)); @@ -72,67 +72,61 @@ class BinaryOpTest OutType *out_ref, *out; }; -const std::vector> inputsf_i32 = { - {0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf_i32 = {{0.000001f, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestF_i32; -TEST_P(BinaryOpTestF_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestF_i32, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32, - ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32, ::testing::ValuesIn(inputsf_i32)); -const std::vector> inputsf_i64 = { - {0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf_i64 = {{0.000001f, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestF_i64; -TEST_P(BinaryOpTestF_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestF_i64, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i64, - ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i64, ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsf_i32_d = { {0.000001f, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestF_i32_D; -TEST_P(BinaryOpTestF_i32_D, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestF_i32_D, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32_D, - ::testing::ValuesIn(inputsf_i32_d)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32_D, ::testing::ValuesIn(inputsf_i32_d)); -const std::vector> inputsd_i32 = { - {0.00000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd_i32 = {{0.00000001, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestD_i32; -TEST_P(BinaryOpTestD_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestD_i32, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i32, - ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i32, ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.00000001, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestD_i64; -TEST_P(BinaryOpTestD_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestD_i64, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i64, - ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i64, ::testing::ValuesIn(inputsd_i64)); template class BinaryOpAlignment : public ::testing::Test { protected: - BinaryOpAlignment() { + BinaryOpAlignment() + { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); } void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } public: - void Misaligned() { + void Misaligned() + { // Test to trigger cudaErrorMisalignedAddress if veclen is incorrectly // chosen. int n = 1024; @@ -142,8 +136,12 @@ class BinaryOpAlignment : public ::testing::Test { CUDA_CHECK(cudaMemsetAsync(x.data(), 0, n * sizeof(math_t), stream)); CUDA_CHECK(cudaMemsetAsync(y.data(), 0, n * sizeof(math_t), stream)); raft::linalg::binaryOp( - z.data() + 9, x.data() + 137, y.data() + 19, 256, - [] __device__(math_t x, math_t y) { return x + y; }, stream); + z.data() + 9, + x.data() + 137, + y.data() + 19, + 256, + [] __device__(math_t x, math_t y) { return x + y; }, + stream); } raft::handle_t handle; diff --git a/cpp/test/linalg/binary_op.cuh b/cpp/test/linalg/binary_op.cuh index fd8ed6dd1e..97cb3ecb24 100644 --- a/cpp/test/linalg/binary_op.cuh +++ b/cpp/test/linalg/binary_op.cuh @@ -24,18 +24,17 @@ namespace raft { namespace linalg { template -__global__ void naiveAddKernel(OutType *out, const InType *in1, - const InType *in2, IdxType len) { +__global__ void naiveAddKernel(OutType* out, const InType* in1, const InType* in2, IdxType len) +{ IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x); - if (idx < len) { - out[idx] = static_cast(in1[idx] + in2[idx]); - } + if (idx < len) { out[idx] = static_cast(in1[idx] + in2[idx]); } } template -void naiveAdd(OutType *out, const InType *in1, const InType *in2, IdxType len) { +void naiveAdd(OutType* out, const InType* in1, const InType* in2, IdxType len) +{ static const IdxType TPB = 64; - IdxType nblks = raft::ceildiv(len, TPB); + IdxType nblks = raft::ceildiv(len, TPB); naiveAddKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -48,8 +47,8 @@ struct BinaryOpInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const BinaryOpInputs &d) { +::std::ostream& operator<<(::std::ostream& os, const BinaryOpInputs& d) +{ return os; } diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu index 00236d53fa..5bbe3166cf 100644 --- a/cpp/test/linalg/cholesky_r1.cu +++ b/cpp/test/linalg/cholesky_r1.cu @@ -36,7 +36,8 @@ class CholeskyR1Test : public ::testing::Test { L(allocator, handle.get_stream(), n_rows * n_rows), L_exp(allocator, handle.get_stream(), n_rows * n_rows), devInfo(allocator, handle.get_stream(), 1), - workspace(allocator, handle.get_stream()) { + workspace(allocator, handle.get_stream()) + { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); raft::update_device(G.data(), G_host, n_rows * n_rows, stream); @@ -48,55 +49,58 @@ class CholeskyR1Test : public ::testing::Test { int n_bytes = 0; // Initializing in CUBLAS_FILL_MODE_LOWER, because that has larger workspace // requirements. - raft::linalg::choleskyRank1Update(handle, L.data(), n_rows, n_rows, nullptr, - &n_bytes, CUBLAS_FILL_MODE_LOWER, stream); + raft::linalg::choleskyRank1Update( + handle, L.data(), n_rows, n_rows, nullptr, &n_bytes, CUBLAS_FILL_MODE_LOWER, stream); Lwork = std::max(Lwork * sizeof(math_t), (size_t)n_bytes); workspace.resize(Lwork, stream); } void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } - void testR1Update() { + void testR1Update() + { int n = n_rows * n_rows; - std::vector fillmode{CUBLAS_FILL_MODE_LOWER, - CUBLAS_FILL_MODE_UPPER}; + std::vector fillmode{CUBLAS_FILL_MODE_LOWER, CUBLAS_FILL_MODE_UPPER}; for (auto uplo : fillmode) { raft::copy(L.data(), G.data(), n, stream); for (int rank = 1; rank <= n_rows; rank++) { std::stringstream ss; - ss << "Rank " << rank - << ((uplo == CUBLAS_FILL_MODE_LOWER) ? ", lower" : ", upper"); + ss << "Rank " << rank << ((uplo == CUBLAS_FILL_MODE_LOWER) ? ", lower" : ", upper"); SCOPED_TRACE(ss.str()); // Expected solution using Cholesky factorization from scratch raft::copy(L_exp.data(), G.data(), n, stream); - CUSOLVER_CHECK(raft::linalg::cusolverDnpotrf( - solver_handle, uplo, rank, L_exp.data(), n_rows, - (math_t*)workspace.data(), Lwork, devInfo.data(), stream)); + CUSOLVER_CHECK(raft::linalg::cusolverDnpotrf(solver_handle, + uplo, + rank, + L_exp.data(), + n_rows, + (math_t*)workspace.data(), + Lwork, + devInfo.data(), + stream)); // Incremental Cholesky factorization using rank one updates. - raft::linalg::choleskyRank1Update(handle, L.data(), rank, n_rows, - workspace.data(), &Lwork, uplo, - stream); + raft::linalg::choleskyRank1Update( + handle, L.data(), rank, n_rows, workspace.data(), &Lwork, uplo, stream); - ASSERT_TRUE(raft::devArrMatch(L_exp.data(), L.data(), n_rows * rank, - raft::CompareApprox(3e-3))); + ASSERT_TRUE(raft::devArrMatch( + L_exp.data(), L.data(), n_rows * rank, raft::CompareApprox(3e-3))); } } } - void testR1Error() { + void testR1Error() + { raft::update_device(G.data(), G2_host, 4, stream); - std::vector fillmode{CUBLAS_FILL_MODE_LOWER, - CUBLAS_FILL_MODE_UPPER}; + std::vector fillmode{CUBLAS_FILL_MODE_LOWER, CUBLAS_FILL_MODE_UPPER}; for (auto uplo : fillmode) { raft::copy(L.data(), G.data(), 4, stream); ASSERT_NO_THROW(raft::linalg::choleskyRank1Update( handle, L.data(), 1, 2, workspace.data(), &Lwork, uplo, stream)); - ASSERT_THROW( - raft::linalg::choleskyRank1Update( - handle, L.data(), 2, 2, workspace.data(), &Lwork, uplo, stream), - raft::exception); + ASSERT_THROW(raft::linalg::choleskyRank1Update( + handle, L.data(), 2, 2, workspace.data(), &Lwork, uplo, stream), + raft::exception); math_t eps = std::numeric_limits::epsilon(); ASSERT_NO_THROW(raft::linalg::choleskyRank1Update( diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu index e45f5651b4..2760d522bc 100644 --- a/cpp/test/linalg/coalesced_reduction.cu +++ b/cpp/test/linalg/coalesced_reduction.cu @@ -33,8 +33,8 @@ struct coalescedReductionInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const coalescedReductionInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const coalescedReductionInputs& dims) +{ return os; } @@ -42,17 +42,18 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void coalescedReductionLaunch(T *dots, const T *data, int cols, int rows, - cudaStream_t stream, bool inplace = false) { - coalescedReduction(dots, data, cols, rows, (T)0, stream, inplace, - [] __device__(T in, int i) { return in * in; }); +void coalescedReductionLaunch( + T* dots, const T* data, int cols, int rows, cudaStream_t stream, bool inplace = false) +{ + coalescedReduction( + dots, data, cols, rows, (T)0, stream, inplace, [] __device__(T in, int i) { return in * in; }); } template -class coalescedReductionTest - : public ::testing::TestWithParam> { +class coalescedReductionTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols; @@ -73,7 +74,8 @@ class coalescedReductionTest CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(dots_exp)); CUDA_CHECK(cudaFree(dots_act)); @@ -84,34 +86,36 @@ class coalescedReductionTest T *data, *dots_exp, *dots_act; }; -const std::vector> inputsf = { - {0.000002f, 1024, 32, 1234ULL}, - {0.000002f, 1024, 64, 1234ULL}, - {0.000002f, 1024, 128, 1234ULL}, - {0.000002f, 1024, 256, 1234ULL}}; +const std::vector> inputsf = {{0.000002f, 1024, 32, 1234ULL}, + {0.000002f, 1024, 64, 1234ULL}, + {0.000002f, 1024, 128, 1234ULL}, + {0.000002f, 1024, 256, 1234ULL}}; -const std::vector> inputsd = { - {0.000000001, 1024, 32, 1234ULL}, - {0.000000001, 1024, 64, 1234ULL}, - {0.000000001, 1024, 128, 1234ULL}, - {0.000000001, 1024, 256, 1234ULL}}; +const std::vector> inputsd = {{0.000000001, 1024, 32, 1234ULL}, + {0.000000001, 1024, 64, 1234ULL}, + {0.000000001, 1024, 128, 1234ULL}, + {0.000000001, 1024, 256, 1234ULL}}; typedef coalescedReductionTest coalescedReductionTestF; -TEST_P(coalescedReductionTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows, - raft::CompareApprox(params.tolerance))); +TEST_P(coalescedReductionTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp, dots_act, params.rows, raft::CompareApprox(params.tolerance))); } typedef coalescedReductionTest coalescedReductionTestD; -TEST_P(coalescedReductionTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows, - raft::CompareApprox(params.tolerance))); +TEST_P(coalescedReductionTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp, dots_act, params.rows, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestF, +INSTANTIATE_TEST_CASE_P(coalescedReductionTests, + coalescedReductionTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestD, +INSTANTIATE_TEST_CASE_P(coalescedReductionTests, + coalescedReductionTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu index 2396558939..d8995ffa0a 100644 --- a/cpp/test/linalg/divide.cu +++ b/cpp/test/linalg/divide.cu @@ -25,30 +25,27 @@ namespace raft { namespace linalg { template -__global__ void naiveDivideKernel(Type *out, const Type *in, Type scalar, - int len) { +__global__ void naiveDivideKernel(Type* out, const Type* in, Type scalar, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in[idx] / scalar; - } + if (idx < len) { out[idx] = in[idx] / scalar; } } template -void naiveDivide(Type *out, const Type *in, Type scalar, int len, - cudaStream_t stream) { +void naiveDivide(Type* out, const Type* in, Type scalar, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveDivideKernel<<>>(out, in, scalar, len); CUDA_CHECK(cudaPeekAtLastError()); } template -class DivideTest - : public ::testing::TestWithParam> { +class DivideTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = - ::testing::TestWithParam>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; cudaStream_t stream; @@ -63,7 +60,8 @@ class DivideTest CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in)); CUDA_CHECK(cudaFree(out_ref)); CUDA_CHECK(cudaFree(out)); @@ -74,25 +72,21 @@ class DivideTest T *in, *out_ref, *out; }; -const std::vector> inputsf = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; typedef DivideTest DivideTestF; -TEST_P(DivideTestF, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - raft::CompareApprox(params.tolerance))); +TEST_P(DivideTestF, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestF, ::testing::ValuesIn(inputsf)); typedef DivideTest DivideTestD; -const std::vector> inputsd = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; -TEST_P(DivideTestD, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - raft::CompareApprox(params.tolerance))); +const std::vector> inputsd = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +TEST_P(DivideTestD, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu index 159d288174..5cad657dab 100644 --- a/cpp/test/linalg/eig.cu +++ b/cpp/test/linalg/eig.cu @@ -35,14 +35,16 @@ struct EigInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const EigInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const EigInputs& dims) +{ return os; } template class EigTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { raft::handle_t handle; stream = handle.get_stream(); @@ -51,8 +53,8 @@ class EigTest : public ::testing::TestWithParam> { int len = params.len; raft::allocate(cov_matrix, len); - T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, - 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; + T cov_matrix_h[] = { + 1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; ASSERT(len == 16, "This test only works with 4x4 matrices!"); raft::update_device(cov_matrix, cov_matrix_h, len, stream); @@ -61,10 +63,23 @@ class EigTest : public ::testing::TestWithParam> { raft::allocate(eig_vectors_jacobi, len); raft::allocate(eig_vals_jacobi, params.n_col); - T eig_vectors_ref_h[] = {0.2790, -0.6498, 0.6498, -0.2789, -0.5123, 0.4874, - 0.4874, -0.5123, 0.6498, 0.2789, -0.2789, -0.6498, - 0.4874, 0.5123, 0.5123, 0.4874}; - T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266}; + T eig_vectors_ref_h[] = {0.2790, + -0.6498, + 0.6498, + -0.2789, + -0.5123, + 0.4874, + 0.4874, + -0.5123, + 0.6498, + 0.2789, + -0.2789, + -0.6498, + 0.4874, + 0.5123, + 0.5123, + 0.4874}; + T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266}; raft::allocate(eig_vectors_ref, len); raft::allocate(eig_vals_ref, params.n_col); @@ -72,13 +87,19 @@ class EigTest : public ::testing::TestWithParam> { raft::update_device(eig_vectors_ref, eig_vectors_ref_h, len, stream); raft::update_device(eig_vals_ref, eig_vals_ref_h, params.n_col, stream); - eigDC(handle, cov_matrix, params.n_row, params.n_col, eig_vectors, eig_vals, - stream); + eigDC(handle, cov_matrix, params.n_row, params.n_col, eig_vectors, eig_vals, stream); - T tol = 1.e-7; + T tol = 1.e-7; int sweeps = 15; - eigJacobi(handle, cov_matrix, params.n_row, params.n_col, - eig_vectors_jacobi, eig_vals_jacobi, stream, tol, sweeps); + eigJacobi(handle, + cov_matrix, + params.n_row, + params.n_col, + eig_vectors_jacobi, + eig_vals_jacobi, + stream, + tol, + sweeps); // test code for comparing two methods len = params.n * params.n; @@ -90,14 +111,20 @@ class EigTest : public ::testing::TestWithParam> { r.uniform(cov_matrix_large, len, T(-1.0), T(1.0), stream); - eigDC(handle, cov_matrix_large, params.n, params.n, eig_vectors_large, - eig_vals_large, stream); - eigJacobi(handle, cov_matrix_large, params.n, params.n, - eig_vectors_jacobi_large, eig_vals_jacobi_large, stream, tol, + eigDC(handle, cov_matrix_large, params.n, params.n, eig_vectors_large, eig_vals_large, stream); + eigJacobi(handle, + cov_matrix_large, + params.n, + params.n, + eig_vectors_jacobi_large, + eig_vals_jacobi_large, + stream, + tol, sweeps); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(cov_matrix)); CUDA_CHECK(cudaFree(eig_vectors)); CUDA_CHECK(cudaFree(eig_vectors_jacobi)); @@ -109,89 +136,95 @@ class EigTest : public ::testing::TestWithParam> { protected: EigInputs params; - T *cov_matrix, *eig_vectors, *eig_vectors_jacobi, *eig_vectors_ref, *eig_vals, - *eig_vals_jacobi, *eig_vals_ref; + T *cov_matrix, *eig_vectors, *eig_vectors_jacobi, *eig_vectors_ref, *eig_vals, *eig_vals_jacobi, + *eig_vals_ref; - T *cov_matrix_large, *eig_vectors_large, *eig_vectors_jacobi_large, - *eig_vals_large, *eig_vals_jacobi_large; + T *cov_matrix_large, *eig_vectors_large, *eig_vectors_jacobi_large, *eig_vals_large, + *eig_vals_jacobi_large; cudaStream_t stream; }; -const std::vector> inputsf2 = { - {0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; -const std::vector> inputsd2 = { - {0.001, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}}; typedef EigTest EigTestValF; -TEST_P(EigTestValF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestValD; -TEST_P(EigTestValD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecF; -TEST_P(EigTestVecF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref, eig_vectors, params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vectors_ref, eig_vectors, params.len, raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecD; -TEST_P(EigTestVecD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref, eig_vectors, params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vectors_ref, eig_vectors, params.len, raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestValJacobiF; -TEST_P(EigTestValJacobiF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref, eig_vals_jacobi, params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValJacobiF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vals_ref, eig_vals_jacobi, params.n_col, raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestValJacobiD; -TEST_P(EigTestValJacobiD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref, eig_vals_jacobi, params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValJacobiD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vals_ref, eig_vals_jacobi, params.n_col, raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecJacobiF; -TEST_P(EigTestVecJacobiF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref, eig_vectors_jacobi, params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecJacobiF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref, + eig_vectors_jacobi, + params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecJacobiD; -TEST_P(EigTestVecJacobiD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref, eig_vectors_jacobi, params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecJacobiD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref, + eig_vectors_jacobi, + params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecCompareF; -TEST_P(EigTestVecCompareF, Result) { - ASSERT_TRUE(raft::devArrMatch( - eig_vectors_large, eig_vectors_jacobi_large, (params.n * params.n), - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecCompareF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_large, + eig_vectors_jacobi_large, + (params.n * params.n), + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecCompareD; -TEST_P(EigTestVecCompareD, Result) { - ASSERT_TRUE(raft::devArrMatch( - eig_vectors_large, eig_vectors_jacobi_large, (params.n * params.n), - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecCompareD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_large, + eig_vectors_jacobi_large, + (params.n * params.n), + raft::CompareApproxAbs(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValF, ::testing::ValuesIn(inputsf2)); @@ -202,17 +235,13 @@ INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecF, ::testing::ValuesIn(inputsf2)); INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiD, ::testing::ValuesIn(inputsd2)); } // namespace linalg } // namespace raft diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu index b3980f281d..b3cfb19174 100644 --- a/cpp/test/linalg/eig_sel.cu +++ b/cpp/test/linalg/eig_sel.cu @@ -37,32 +37,44 @@ struct EigSelInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const EigSelInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const EigSelInputs& dims) +{ return os; } template class EigSelTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { raft::handle_t handle; stream = handle.get_stream(); - params = ::testing::TestWithParam>::GetParam(); + params = ::testing::TestWithParam>::GetParam(); int len = params.len; raft::allocate(cov_matrix, len); - T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, - 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; + T cov_matrix_h[] = { + 1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; ASSERT(len == 16, "This test only works with 4x4 matrices!"); raft::update_device(cov_matrix, cov_matrix_h, len, stream); raft::allocate(eig_vectors, 12); raft::allocate(eig_vals, params.n_col); - T eig_vectors_ref_h[] = {-0.5123, 0.4874, 0.4874, -0.5123, 0.6498, 0.2789, - -0.2789, -0.6498, 0.4874, 0.5123, 0.5123, 0.4874}; - T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266}; + T eig_vectors_ref_h[] = {-0.5123, + 0.4874, + 0.4874, + -0.5123, + 0.6498, + 0.2789, + -0.2789, + -0.6498, + 0.4874, + 0.5123, + 0.5123, + 0.4874}; + T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266}; raft::allocate(eig_vectors_ref, 12); raft::allocate(eig_vals_ref, params.n_col); @@ -70,11 +82,19 @@ class EigSelTest : public ::testing::TestWithParam> { raft::update_device(eig_vectors_ref, eig_vectors_ref_h, 12, stream); raft::update_device(eig_vals_ref, eig_vals_ref_h, 4, stream); - eigSelDC(handle, cov_matrix, params.n_row, params.n_col, 3, eig_vectors, - eig_vals, EigVecMemUsage::OVERWRITE_INPUT, stream); + eigSelDC(handle, + cov_matrix, + params.n_row, + params.n_col, + 3, + eig_vectors, + eig_vals, + EigVecMemUsage::OVERWRITE_INPUT, + stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(cov_matrix)); CUDA_CHECK(cudaFree(eig_vectors)); CUDA_CHECK(cudaFree(eig_vals)); @@ -89,51 +109,45 @@ class EigSelTest : public ::testing::TestWithParam> { cudaStream_t stream; }; -const std::vector> inputsf2 = { - {0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; -const std::vector> inputsd2 = { - {0.001, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}}; typedef EigSelTest EigSelTestValF; -TEST_P(EigSelTestValF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestValF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs(params.tolerance))); } typedef EigSelTest EigSelTestValD; -TEST_P(EigSelTestValD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestValD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs(params.tolerance))); } typedef EigSelTest EigSelTestVecF; -TEST_P(EigSelTestVecF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref, eig_vectors, 12, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestVecF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vectors_ref, eig_vectors, 12, raft::CompareApproxAbs(params.tolerance))); } typedef EigSelTest EigSelTestVecD; -TEST_P(EigSelTestVecD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref, eig_vectors, 12, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestVecD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + eig_vectors_ref, eig_vectors, 12, raft::CompareApproxAbs(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecD, ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu index 572951c557..f0e04403e8 100644 --- a/cpp/test/linalg/eltwise.cu +++ b/cpp/test/linalg/eltwise.cu @@ -26,19 +26,17 @@ namespace linalg { //// Testing unary ops template -__global__ void naiveScaleKernel(Type *out, const Type *in, Type scalar, - int len) { +__global__ void naiveScaleKernel(Type* out, const Type* in, Type scalar, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = scalar * in[idx]; - } + if (idx < len) { out[idx] = scalar * in[idx]; } } template -void naiveScale(Type *out, const Type *in, Type scalar, int len, - cudaStream_t stream) { +void naiveScale(Type* out, const Type* in, Type scalar, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveScaleKernel<<>>(out, in, scalar, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -52,19 +50,19 @@ struct ScalarMultiplyInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const ScalarMultiplyInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const ScalarMultiplyInputs& dims) +{ return os; } template -class ScalarMultiplyTest - : public ::testing::TestWithParam> { +class ScalarMultiplyTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - int len = params.len; + int len = params.len; T scalar = params.scalar; cudaStream_t stream; @@ -78,7 +76,8 @@ class ScalarMultiplyTest CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in)); CUDA_CHECK(cudaFree(out_ref)); CUDA_CHECK(cudaFree(out)); @@ -89,46 +88,41 @@ class ScalarMultiplyTest T *in, *out_ref, *out; }; -const std::vector> inputsf1 = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf1 = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; const std::vector> inputsd1 = { {0.00000001, 1024 * 1024, 2.0, 1234ULL}}; typedef ScalarMultiplyTest ScalarMultiplyTestF; -TEST_P(ScalarMultiplyTestF, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(ScalarMultiplyTestF, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } typedef ScalarMultiplyTest ScalarMultiplyTestD; -TEST_P(ScalarMultiplyTestD, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(ScalarMultiplyTestD, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestF, - ::testing::ValuesIn(inputsf1)); +INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestF, ::testing::ValuesIn(inputsf1)); -INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestD, - ::testing::ValuesIn(inputsd1)); +INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestD, ::testing::ValuesIn(inputsd1)); //// Testing binary ops template -__global__ void naiveAddKernel(Type *out, const Type *in1, const Type *in2, - int len) { +__global__ void naiveAddKernel(Type* out, const Type* in1, const Type* in2, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in1[idx] + in2[idx]; - } + if (idx < len) { out[idx] = in1[idx] + in2[idx]; } } template -void naiveAdd(Type *out, const Type *in1, const Type *in2, int len, - cudaStream_t stream) { +void naiveAdd(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveAddKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -141,15 +135,16 @@ struct EltwiseAddInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const EltwiseAddInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const EltwiseAddInputs& dims) +{ return os; } template class EltwiseAddTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); @@ -167,7 +162,8 @@ class EltwiseAddTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in1)); CUDA_CHECK(cudaFree(in2)); CUDA_CHECK(cudaFree(out_ref)); @@ -179,29 +175,25 @@ class EltwiseAddTest : public ::testing::TestWithParam> { T *in1, *in2, *out_ref, *out; }; -const std::vector> inputsf2 = { - {0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}}; -const std::vector> inputsd2 = { - {0.00000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}}; typedef EltwiseAddTest EltwiseAddTestF; -TEST_P(EltwiseAddTestF, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(EltwiseAddTestF, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } typedef EltwiseAddTest EltwiseAddTestD; -TEST_P(EltwiseAddTestD, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(EltwiseAddTestD, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestD, ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/gemm_layout.cu b/cpp/test/linalg/gemm_layout.cu index cecfc5eb8e..e95dbbc502 100644 --- a/cpp/test/linalg/gemm_layout.cu +++ b/cpp/test/linalg/gemm_layout.cu @@ -36,9 +36,9 @@ struct GemmLayoutInputs { // Reference GEMM implementation. template -__global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K, - bool isZColMajor, bool isXColMajor, - bool isYColMajor) { +__global__ void naiveGemm( + T* Z, T* X, T* Y, int M, int N, int K, bool isZColMajor, bool isXColMajor, bool isYColMajor) +{ int tidx = blockIdx.x * blockDim.x + threadIdx.x; int tidy = blockIdx.y * blockDim.y + threadIdx.y; @@ -51,7 +51,7 @@ __global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K, temp += X[xIndex] * Y[yIndex]; } int zIndex = isZColMajor ? m + n * M : m * N + n; - Z[zIndex] = temp; + Z[zIndex] = temp; } } } @@ -59,7 +59,8 @@ __global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K, template class GemmLayoutTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::handle_t handle; @@ -72,8 +73,8 @@ class GemmLayoutTest : public ::testing::TestWithParam> { // Dimensions of Y : K x N // Dimensions of Z : M x N - T *X = NULL; // Argument X - T *Y = NULL; // Argument Y + T* X = NULL; // Argument X + T* Y = NULL; // Argument Y size_t xElems = params.M * params.K; size_t yElems = params.K * params.N; @@ -87,27 +88,35 @@ class GemmLayoutTest : public ::testing::TestWithParam> { r.uniform(X, xElems, T(-10.0), T(10.0), stream); r.uniform(Y, yElems, T(-10.0), T(10.0), stream); - dim3 blocks(raft::ceildiv(params.M, 128), - raft::ceildiv(params.N, 4), 1); + dim3 blocks(raft::ceildiv(params.M, 128), raft::ceildiv(params.N, 4), 1); dim3 threads(128, 4, 1); - naiveGemm<<>>(refZ, X, Y, params.M, params.N, params.K, - params.zLayout, params.xLayout, - params.yLayout); - - gemm(handle, Z, X, Y, params.M, params.N, params.K, params.zLayout, - params.xLayout, params.yLayout, stream); + naiveGemm<<>>( + refZ, X, Y, params.M, params.N, params.K, params.zLayout, params.xLayout, params.yLayout); + + gemm(handle, + Z, + X, + Y, + params.M, + params.N, + params.K, + params.zLayout, + params.xLayout, + params.yLayout, + stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(refZ)); CUDA_CHECK(cudaFree(Z)); } protected: GemmLayoutInputs params; - T *refZ = NULL; // Reference result for comparison - T *Z = NULL; // Computed result + T* refZ = NULL; // Reference result for comparison + T* Z = NULL; // Computed result }; const std::vector> inputsf = { @@ -131,22 +140,20 @@ const std::vector> inputsd = { {50, 80, 60, false, false, false, 893038ULL}}; typedef GemmLayoutTest GemmLayoutTestF; -TEST_P(GemmLayoutTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, - raft::CompareApprox(1e-4))); +TEST_P(GemmLayoutTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox(1e-4))); } typedef GemmLayoutTest GemmLayoutTestD; -TEST_P(GemmLayoutTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, - raft::CompareApprox(1e-6))); +TEST_P(GemmLayoutTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox(1e-6))); } -INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index 227bce6a48..0e33d9758f 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -26,13 +26,22 @@ namespace raft { namespace linalg { template -void mapLaunch(OutType *out, const InType *in1, const InType *in2, - const InType *in3, InType scalar, IdxType len, - cudaStream_t stream) { +void mapLaunch(OutType* out, + const InType* in1, + const InType* in2, + const InType* in3, + InType scalar, + IdxType len, + cudaStream_t stream) +{ map( - out, len, + out, + len, [=] __device__(InType a, InType b, InType c) { return a + b + c + scalar; }, - stream, in1, in2, in3); + stream, + in1, + in2, + in3); } template @@ -44,10 +53,15 @@ struct MapInputs { }; template -void create_ref(OutType *out_ref, const InType *in1, const InType *in2, - const InType *in3, InType scalar, IdxType len, - cudaStream_t stream) { - InType *tmp; +void create_ref(OutType* out_ref, + const InType* in1, + const InType* in2, + const InType* in3, + InType scalar, + IdxType len, + cudaStream_t stream) +{ + InType* tmp; allocate(tmp, len); eltwiseAdd(tmp, in1, in2, len, stream); eltwiseAdd(out_ref, tmp, in3, len, stream); @@ -56,12 +70,11 @@ void create_ref(OutType *out_ref, const InType *in1, const InType *in2, } template -class MapTest - : public ::testing::TestWithParam> { +class MapTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = - ::testing::TestWithParam>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); cudaStream_t stream; @@ -81,7 +94,8 @@ class MapTest CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in1)); CUDA_CHECK(cudaFree(in2)); CUDA_CHECK(cudaFree(in3)); @@ -95,55 +109,47 @@ class MapTest OutType *out_ref, *out; }; -const std::vector> inputsf_i32 = { - {0.000001f, 1024 * 1024, 1234ULL, 3.2}}; +const std::vector> inputsf_i32 = {{0.000001f, 1024 * 1024, 1234ULL, 3.2}}; typedef MapTest MapTestF_i32; -TEST_P(MapTestF_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestF_i32, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32, - ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32, ::testing::ValuesIn(inputsf_i32)); -const std::vector> inputsf_i64 = { - {0.000001f, 1024 * 1024, 1234ULL, 9.4}}; +const std::vector> inputsf_i64 = {{0.000001f, 1024 * 1024, 1234ULL, 9.4}}; typedef MapTest MapTestF_i64; -TEST_P(MapTestF_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestF_i64, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i64, - ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i64, ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsf_i32_d = { {0.000001f, 1024 * 1024, 1234ULL, 5.9}}; typedef MapTest MapTestF_i32_D; -TEST_P(MapTestF_i32_D, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestF_i32_D, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32_D, - ::testing::ValuesIn(inputsf_i32_d)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32_D, ::testing::ValuesIn(inputsf_i32_d)); -const std::vector> inputsd_i32 = { - {0.00000001, 1024 * 1024, 1234ULL, 7.5}}; +const std::vector> inputsd_i32 = {{0.00000001, 1024 * 1024, 1234ULL, 7.5}}; typedef MapTest MapTestD_i32; -TEST_P(MapTestD_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestD_i32, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i32, - ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i32, ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.00000001, 1024 * 1024, 1234ULL, 5.2}}; typedef MapTest MapTestD_i64; -TEST_P(MapTestD_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestD_i64, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i64, - ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i64, ::testing::ValuesIn(inputsd_i64)); } // namespace linalg } // namespace raft diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu index 6e146fa4bb..a1b82e7644 100644 --- a/cpp/test/linalg/map_then_reduce.cu +++ b/cpp/test/linalg/map_then_reduce.cu @@ -25,21 +25,18 @@ namespace raft { namespace linalg { template -__global__ void naiveMapReduceKernel(OutType *out, const InType *in, size_t len, - MapOp map) { +__global__ void naiveMapReduceKernel(OutType* out, const InType* in, size_t len, MapOp map) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - raft::myAtomicAdd(out, (OutType)map(in[idx])); - } + if (idx < len) { raft::myAtomicAdd(out, (OutType)map(in[idx])); } } template -void naiveMapReduce(OutType *out, const InType *in, size_t len, MapOp map, - cudaStream_t stream) { +void naiveMapReduce(OutType* out, const InType* in, size_t len, MapOp map, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, (size_t)TPB); - naiveMapReduceKernel - <<>>(out, in, len, map); + int nblks = raft::ceildiv(len, (size_t)TPB); + naiveMapReduceKernel<<>>(out, in, len, map); CUDA_CHECK(cudaPeekAtLastError()); } @@ -51,7 +48,8 @@ struct MapReduceInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const MapReduceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MapReduceInputs& dims) +{ return os; } @@ -59,8 +57,9 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void mapReduceLaunch(OutType *out_ref, OutType *out, const InType *in, - size_t len, cudaStream_t stream) { +void mapReduceLaunch( + OutType* out_ref, OutType* out, const InType* in, size_t len, cudaStream_t stream) +{ auto op = [] __device__(InType in) { return in; }; naiveMapReduce(out_ref, in, len, op, stream); mapThenSumReduce(out, len, op, 0, in); @@ -69,7 +68,8 @@ void mapReduceLaunch(OutType *out_ref, OutType *out, const InType *in, template class MapReduceTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); auto len = params.len; @@ -84,7 +84,8 @@ class MapReduceTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in)); CUDA_CHECK(cudaFree(out_ref)); CUDA_CHECK(cudaFree(out)); @@ -92,48 +93,44 @@ class MapReduceTest : public ::testing::TestWithParam> { protected: MapReduceInputs params; - InType *in; + InType* in; OutType *out_ref, *out; }; -const std::vector> inputsf = { - {0.001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf = {{0.001f, 1024 * 1024, 1234ULL}}; typedef MapReduceTest MapReduceTestFF; -TEST_P(MapReduceTestFF, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(MapReduceTestFF, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFF, ::testing::ValuesIn(inputsf)); typedef MapReduceTest MapReduceTestFD; -TEST_P(MapReduceTestFD, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(MapReduceTestFD, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFD, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFD, ::testing::ValuesIn(inputsf)); -const std::vector> inputsd = { - {0.000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd = {{0.000001, 1024 * 1024, 1234ULL}}; typedef MapReduceTest MapReduceTestDD; -TEST_P(MapReduceTestDD, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); +TEST_P(MapReduceTestDD, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestDD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestDD, ::testing::ValuesIn(inputsd)); template class MapGenericReduceTest : public ::testing::Test { - using InType = typename T::first_type; + using InType = typename T::first_type; using OutType = typename T::second_type; protected: MapGenericReduceTest() : allocator(handle.get_device_allocator()), input(allocator, handle.get_stream(), n), - output(allocator, handle.get_stream(), 1) { + output(allocator, handle.get_stream(), 1) + { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); initInput(input.data(), input.size(), stream); @@ -142,7 +139,8 @@ class MapGenericReduceTest : public ::testing::Test { void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } public: - void initInput(InType *input, int n, cudaStream_t stream) { + void initInput(InType* input, int n, cudaStream_t stream) + { raft::random::Rng r(137); r.uniform(input, n, InType(2), InType(3), stream); InType val = 1; @@ -151,21 +149,19 @@ class MapGenericReduceTest : public ::testing::Test { raft::update_device(input + 337, &val, 1, stream); } - void testMin() { - auto op = [] __device__(InType in) { return in; }; + void testMin() + { + auto op = [] __device__(InType in) { return in; }; const OutType neutral = std::numeric_limits::max(); - mapThenReduce(output.data(), input.size(), neutral, op, cub::Min(), stream, - input.data()); - EXPECT_TRUE(raft::devArrMatch(OutType(1), output.data(), 1, - raft::Compare())); + mapThenReduce(output.data(), input.size(), neutral, op, cub::Min(), stream, input.data()); + EXPECT_TRUE(raft::devArrMatch(OutType(1), output.data(), 1, raft::Compare())); } - void testMax() { - auto op = [] __device__(InType in) { return in; }; + void testMax() + { + auto op = [] __device__(InType in) { return in; }; const OutType neutral = std::numeric_limits::min(); - mapThenReduce(output.data(), input.size(), neutral, op, cub::Max(), stream, - input.data()); - EXPECT_TRUE(raft::devArrMatch(OutType(5), output.data(), 1, - raft::Compare())); + mapThenReduce(output.data(), input.size(), neutral, op, cub::Max(), stream, input.data()); + EXPECT_TRUE(raft::devArrMatch(OutType(5), output.data(), 1, raft::Compare())); } protected: @@ -178,8 +174,7 @@ class MapGenericReduceTest : public ::testing::Test { }; using IoTypePair = - ::testing::Types, std::pair, - std::pair>; + ::testing::Types, std::pair, std::pair>; TYPED_TEST_CASE(MapGenericReduceTest, IoTypePair); TYPED_TEST(MapGenericReduceTest, min) { this->testMin(); } diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index aa46c78b0f..6ad9bfba10 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -32,8 +32,8 @@ struct MatVecOpInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const MatVecOpInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MatVecOpInputs& dims) +{ return os; } @@ -41,26 +41,48 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void matrixVectorOpLaunch(T *out, const T *in, const T *vec1, const T *vec2, - IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, bool useTwoVectors, - cudaStream_t stream) { +void matrixVectorOpLaunch(T* out, + const T* in, + const T* vec1, + const T* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + bool useTwoVectors, + cudaStream_t stream) +{ if (useTwoVectors) { matrixVectorOp( - out, in, vec1, vec2, D, N, rowMajor, bcastAlongRows, - [] __device__(T a, T b, T c) { return a + b + c; }, stream); + out, + in, + vec1, + vec2, + D, + N, + rowMajor, + bcastAlongRows, + [] __device__(T a, T b, T c) { return a + b + c; }, + stream); } else { matrixVectorOp( - out, in, vec1, D, N, rowMajor, bcastAlongRows, - [] __device__(T a, T b) { return a + b; }, stream); + out, + in, + vec1, + D, + N, + rowMajor, + bcastAlongRows, + [] __device__(T a, T b) { return a + b; }, + stream); } } template -class MatVecOpTest - : public ::testing::TestWithParam> { +class MatVecOpTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); IdxType N = params.rows, D = params.cols; @@ -78,18 +100,25 @@ class MatVecOpTest r.uniform(vec1, vecLen, (T)-1.0, (T)1.0, stream); r.uniform(vec2, vecLen, (T)-1.0, (T)1.0, stream); if (params.useTwoVectors) { - naiveMatVec(out_ref, in, vec1, vec2, D, N, params.rowMajor, - params.bcastAlongRows, (T)1.0); + naiveMatVec(out_ref, in, vec1, vec2, D, N, params.rowMajor, params.bcastAlongRows, (T)1.0); } else { - naiveMatVec(out_ref, in, vec1, D, N, params.rowMajor, - params.bcastAlongRows, (T)1.0); + naiveMatVec(out_ref, in, vec1, D, N, params.rowMajor, params.bcastAlongRows, (T)1.0); } - matrixVectorOpLaunch(out, in, vec1, vec2, D, N, params.rowMajor, - params.bcastAlongRows, params.useTwoVectors, stream); + matrixVectorOpLaunch(out, + in, + vec1, + vec2, + D, + N, + params.rowMajor, + params.bcastAlongRows, + params.useTwoVectors, + stream); CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(vec1)); CUDA_CHECK(cudaFree(vec2)); CUDA_CHECK(cudaFree(out)); @@ -121,23 +150,23 @@ const std::vector> inputsf_i32 = { {0.00001f, 1024, 32, false, false, true, 1234ULL}, {0.00001f, 1024, 64, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestF_i32; -TEST_P(MatVecOpTestF_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, - CompareApprox(params.tolerance))); +TEST_P(MatVecOpTestF_i32, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32, - ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32, ::testing::ValuesIn(inputsf_i32)); const std::vector> inputsf_i64 = { {0.00001f, 2500, 250, false, false, false, 1234ULL}, {0.00001f, 2500, 250, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestF_i64; -TEST_P(MatVecOpTestF_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, - CompareApprox(params.tolerance))); +TEST_P(MatVecOpTestF_i64, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64, - ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64, ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsd_i32 = { {0.0000001, 1024, 32, true, true, false, 1234ULL}, @@ -158,23 +187,23 @@ const std::vector> inputsd_i32 = { {0.0000001, 1024, 32, false, false, true, 1234ULL}, {0.0000001, 1024, 64, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestD_i32; -TEST_P(MatVecOpTestD_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, - CompareApprox(params.tolerance))); +TEST_P(MatVecOpTestD_i32, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32, - ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32, ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.0000001, 2500, 250, false, false, false, 1234ULL}, {0.0000001, 2500, 250, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestD_i64; -TEST_P(MatVecOpTestD_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, - CompareApprox(params.tolerance))); +TEST_P(MatVecOpTestD_i64, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64, - ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64, ::testing::ValuesIn(inputsd_i64)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh index 69c45c9866..5f9c6f1ef3 100644 --- a/cpp/test/linalg/matrix_vector_op.cuh +++ b/cpp/test/linalg/matrix_vector_op.cuh @@ -22,9 +22,15 @@ namespace raft { namespace linalg { template -__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec, - IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Type scalar) { +__global__ void naiveMatVecKernel(Type* out, + const Type* mat, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar) +{ IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; IdxType len = N * D; IdxType col; @@ -37,27 +43,37 @@ __global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec, } else { col = idx / N; } - if (idx < len) { - out[idx] = mat[idx] + scalar * vec[col]; - } + if (idx < len) { out[idx] = mat[idx] + scalar * vec[col]; } } template -void naiveMatVec(Type *out, const Type *mat, const Type *vec, IdxType D, - IdxType N, bool rowMajor, bool bcastAlongRows, Type scalar) { +void naiveMatVec(Type* out, + const Type* mat, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar) +{ static const IdxType TPB = 64; - IdxType len = N * D; - IdxType nblks = raft::ceildiv(len, TPB); - naiveMatVecKernel - <<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar); + IdxType len = N * D; + IdxType nblks = raft::ceildiv(len, TPB); + naiveMatVecKernel<<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec1, - const Type *vec2, IdxType D, IdxType N, - bool rowMajor, bool bcastAlongRows, - Type scalar) { +__global__ void naiveMatVecKernel(Type* out, + const Type* mat, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar) +{ IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; IdxType len = N * D; IdxType col; @@ -70,20 +86,25 @@ __global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec1, } else { col = idx / N; } - if (idx < len) { - out[idx] = mat[idx] + scalar * vec1[col] + vec2[col]; - } + if (idx < len) { out[idx] = mat[idx] + scalar * vec1[col] + vec2[col]; } } template -void naiveMatVec(Type *out, const Type *mat, const Type *vec1, const Type *vec2, - IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows, - Type scalar) { +void naiveMatVec(Type* out, + const Type* mat, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar) +{ static const IdxType TPB = 64; - IdxType len = N * D; - IdxType nblks = raft::ceildiv(len, TPB); - naiveMatVecKernel<<>>(out, mat, vec1, vec2, D, N, rowMajor, - bcastAlongRows, scalar); + IdxType len = N * D; + IdxType nblks = raft::ceildiv(len, TPB); + naiveMatVecKernel + <<>>(out, mat, vec1, vec2, D, N, rowMajor, bcastAlongRows, scalar); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu index 1d3e753de3..6c38d89891 100644 --- a/cpp/test/linalg/multiply.cu +++ b/cpp/test/linalg/multiply.cu @@ -27,7 +27,8 @@ namespace linalg { template class MultiplyTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; @@ -43,7 +44,8 @@ class MultiplyTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in)); CUDA_CHECK(cudaFree(out_ref)); CUDA_CHECK(cudaFree(out)); @@ -54,25 +56,21 @@ class MultiplyTest : public ::testing::TestWithParam> { T *in, *out_ref, *out; }; -const std::vector> inputsf = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; typedef MultiplyTest MultiplyTestF; -TEST_P(MultiplyTestF, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - raft::CompareApprox(params.tolerance))); +TEST_P(MultiplyTestF, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestF, ::testing::ValuesIn(inputsf)); typedef MultiplyTest MultiplyTestD; -const std::vector> inputsd = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; -TEST_P(MultiplyTestD, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - raft::CompareApprox(params.tolerance))); +const std::vector> inputsd = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +TEST_P(MultiplyTestD, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu index acc25addd0..35bc72dee4 100644 --- a/cpp/test/linalg/norm.cu +++ b/cpp/test/linalg/norm.cu @@ -34,17 +34,19 @@ struct NormInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const NormInputs &I) { - os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " - << I.type << ", " << I.do_sqrt << ", " << I.seed << '}' << std::endl; +::std::ostream& operator<<(::std::ostream& os, const NormInputs& I) +{ + os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " << I.type << ", " + << I.do_sqrt << ", " << I.seed << '}' << std::endl; return os; } ///// Row-wise norm test definitions template -__global__ void naiveRowNormKernel(Type *dots, const Type *data, int D, int N, - NormType type, bool do_sqrt) { - Type acc = (Type)0; +__global__ void naiveRowNormKernel( + Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt) +{ + Type acc = (Type)0; int rowStart = threadIdx.x + blockIdx.x * blockDim.x; if (rowStart < N) { for (int i = 0; i < D; ++i) { @@ -59,19 +61,20 @@ __global__ void naiveRowNormKernel(Type *dots, const Type *data, int D, int N, } template -void naiveRowNorm(Type *dots, const Type *data, int D, int N, NormType type, - bool do_sqrt, cudaStream_t stream) { +void naiveRowNorm( + Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(N, TPB); - naiveRowNormKernel - <<>>(dots, data, D, N, type, do_sqrt); + int nblks = raft::ceildiv(N, TPB); + naiveRowNormKernel<<>>(dots, data, D, N, type, do_sqrt); CUDA_CHECK(cudaPeekAtLastError()); } template class RowNormTest : public ::testing::TestWithParam> { public: - void SetUp() override { + void SetUp() override + { CUDA_CHECK(cudaStreamCreate(&stream)); params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); @@ -82,19 +85,18 @@ class RowNormTest : public ::testing::TestWithParam> { raft::allocate(dots_exp, rows); raft::allocate(dots_act, rows); r.uniform(data, len, T(-1.0), T(1.0), stream); - naiveRowNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, - stream); + naiveRowNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, stream); if (params.do_sqrt) { auto fin_op = [] __device__(T in) { return raft::mySqrt(in); }; - rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream, - fin_op); + rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream, fin_op); } else { rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream); } CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(dots_exp)); CUDA_CHECK(cudaFree(dots_act)); @@ -109,10 +111,11 @@ class RowNormTest : public ::testing::TestWithParam> { ///// Column-wise norm test definitisons template -__global__ void naiveColNormKernel(Type *dots, const Type *data, int D, int N, - NormType type, bool do_sqrt) { +__global__ void naiveColNormKernel( + Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt) +{ int colID = threadIdx.x + blockIdx.x * blockDim.x; - if (colID > D) return; //avoid out-of-bounds thread + if (colID > D) return; // avoid out-of-bounds thread Type acc = 0; for (int i = 0; i < N; i++) { @@ -124,19 +127,20 @@ __global__ void naiveColNormKernel(Type *dots, const Type *data, int D, int N, } template -void naiveColNorm(Type *dots, const Type *data, int D, int N, NormType type, - bool do_sqrt, cudaStream_t stream) { +void naiveColNorm( + Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(D, TPB); - naiveColNormKernel - <<>>(dots, data, D, N, type, do_sqrt); + int nblks = raft::ceildiv(D, TPB); + naiveColNormKernel<<>>(dots, data, D, N, type, do_sqrt); CUDA_CHECK(cudaPeekAtLastError()); } template class ColNormTest : public ::testing::TestWithParam> { public: - void SetUp() override { + void SetUp() override + { CUDA_CHECK(cudaStreamCreate(&stream)); params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); @@ -148,19 +152,18 @@ class ColNormTest : public ::testing::TestWithParam> { raft::allocate(dots_exp, cols); raft::allocate(dots_act, cols); - naiveColNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, - stream); + naiveColNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, stream); if (params.do_sqrt) { auto fin_op = [] __device__(T in) { return raft::mySqrt(in); }; - colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream, - fin_op); + colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream, fin_op); } else { colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream); } CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(dots_exp)); CUDA_CHECK(cudaFree(dots_act)); @@ -174,24 +177,23 @@ class ColNormTest : public ::testing::TestWithParam> { }; ///// Row- and column-wise tests -const std::vector> inputsf = { - {0.00001f, 1024, 32, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL}, - {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL}, - {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL}, - {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL}, - - {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL}, - {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL}, - {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL}, - {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}}; +const std::vector> inputsf = {{0.00001f, 1024, 32, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL}, + {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL}, + {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL}, + {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL}, + + {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL}, + {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL}, + {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL}, + {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}}; const std::vector> inputsd = { {0.00000001, 1024, 32, L1Norm, false, true, 1234ULL}, @@ -213,22 +215,22 @@ const std::vector> inputsd = { {0.00000001, 1024, 256, L2Norm, true, true, 1234ULL}}; typedef RowNormTest RowNormTestF; -TEST_P(RowNormTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows, - raft::CompareApprox(params.tolerance))); +TEST_P(RowNormTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp, dots_act, params.rows, raft::CompareApprox(params.tolerance))); } typedef RowNormTest RowNormTestD; -TEST_P(RowNormTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows, - raft::CompareApprox(params.tolerance))); +TEST_P(RowNormTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp, dots_act, params.rows, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD, ::testing::ValuesIn(inputsd)); const std::vector> inputscf = { {0.00001f, 32, 1024, L1Norm, false, true, 1234ULL}, @@ -269,22 +271,22 @@ const std::vector> inputscd = { {0.00000001, 256, 1024, L2Norm, true, true, 1234ULL}}; typedef ColNormTest ColNormTestF; -TEST_P(ColNormTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(ColNormTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp, dots_act, params.cols, raft::CompareApprox(params.tolerance))); } typedef ColNormTest ColNormTestD; -TEST_P(ColNormTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(ColNormTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp, dots_act, params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF, - ::testing::ValuesIn(inputscf)); +INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF, ::testing::ValuesIn(inputscf)); -INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD, - ::testing::ValuesIn(inputscd)); +INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD, ::testing::ValuesIn(inputscd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index 9082397265..85c84777e4 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -34,8 +34,8 @@ struct ReduceInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const ReduceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const ReduceInputs& dims) +{ return os; } @@ -43,45 +43,55 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void reduceLaunch(OutType *dots, const InType *data, int cols, int rows, - bool rowMajor, bool alongRows, bool inplace, - cudaStream_t stream) { - reduce( - dots, data, cols, rows, (OutType)0, rowMajor, alongRows, stream, inplace, - [] __device__(InType in, int i) { return static_cast(in * in); }); +void reduceLaunch(OutType* dots, + const InType* data, + int cols, + int rows, + bool rowMajor, + bool alongRows, + bool inplace, + cudaStream_t stream) +{ + reduce(dots, + data, + cols, + rows, + (OutType)0, + rowMajor, + alongRows, + stream, + inplace, + [] __device__(InType in, int i) { return static_cast(in * in); }); } template -class ReduceTest - : public ::testing::TestWithParam> { +class ReduceTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { CUDA_CHECK(cudaStreamCreate(&stream)); - params = - ::testing::TestWithParam>::GetParam(); + params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols; int len = rows * cols; - outlen = params.alongRows ? rows : cols; + outlen = params.alongRows ? rows : cols; raft::allocate(data, len); raft::allocate(dots_exp, outlen); raft::allocate(dots_act, outlen); r.uniform(data, len, InType(-1.0), InType(1.0), stream); - naiveReduction(dots_exp, data, cols, rows, params.rowMajor, - params.alongRows, stream); + naiveReduction(dots_exp, data, cols, rows, params.rowMajor, params.alongRows, stream); // Perform reduction with default inplace = false first - reduceLaunch(dots_act, data, cols, rows, params.rowMajor, params.alongRows, - false, stream); + reduceLaunch(dots_act, data, cols, rows, params.rowMajor, params.alongRows, false, stream); // Add to result with inplace = true next, which shouldn't affect // in the case of coalescedReduction! if (!(params.rowMajor ^ params.alongRows)) { - reduceLaunch(dots_act, data, cols, rows, params.rowMajor, - params.alongRows, true, stream); + reduceLaunch(dots_act, data, cols, rows, params.rowMajor, params.alongRows, true, stream); } } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(dots_exp)); CUDA_CHECK(cudaFree(dots_act)); @@ -90,7 +100,7 @@ class ReduceTest protected: ReduceInputs params; - InType *data; + InType* data; OutType *dots_exp, *dots_act; int outlen; cudaStream_t stream; @@ -151,31 +161,31 @@ const std::vector> inputsfd = { {0.000002f, 1024, 256, false, false, 1234ULL}}; typedef ReduceTest ReduceTestFF; -TEST_P(ReduceTestFF, Result) { - ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen, - raft::CompareApprox(params.tolerance))); +TEST_P(ReduceTestFF, Result) +{ + ASSERT_TRUE( + devArrMatch(dots_exp, dots_act, outlen, raft::CompareApprox(params.tolerance))); } typedef ReduceTest ReduceTestDD; -TEST_P(ReduceTestDD, Result) { - ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen, - raft::CompareApprox(params.tolerance))); +TEST_P(ReduceTestDD, Result) +{ + ASSERT_TRUE( + devArrMatch(dots_exp, dots_act, outlen, raft::CompareApprox(params.tolerance))); } typedef ReduceTest ReduceTestFD; -TEST_P(ReduceTestFD, Result) { - ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen, - raft::CompareApprox(params.tolerance))); +TEST_P(ReduceTestFD, Result) +{ + ASSERT_TRUE( + devArrMatch(dots_exp, dots_act, outlen, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF, - ::testing::ValuesIn(inputsff)); +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF, ::testing::ValuesIn(inputsff)); -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD, - ::testing::ValuesIn(inputsdd)); +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD, ::testing::ValuesIn(inputsdd)); -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD, - ::testing::ValuesIn(inputsfd)); +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD, ::testing::ValuesIn(inputsfd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh index 30a9c2e271..86f9c2d8b8 100644 --- a/cpp/test/linalg/reduce.cuh +++ b/cpp/test/linalg/reduce.cuh @@ -26,52 +26,69 @@ namespace raft { namespace linalg { template -__global__ void naiveCoalescedReductionKernel(OutType *dots, const InType *data, - int D, int N) { - OutType acc = (OutType)0; +__global__ void naiveCoalescedReductionKernel(OutType* dots, const InType* data, int D, int N) +{ + OutType acc = (OutType)0; int rowStart = threadIdx.x + blockIdx.x * blockDim.x; if (rowStart < N) { for (int i = 0; i < D; ++i) { - acc += - static_cast(data[rowStart * D + i] * data[rowStart * D + i]); + acc += static_cast(data[rowStart * D + i] * data[rowStart * D + i]); } dots[rowStart] = 2 * acc; } } template -void naiveCoalescedReduction(OutType *dots, const InType *data, int D, int N, - cudaStream_t stream) { +void naiveCoalescedReduction(OutType* dots, const InType* data, int D, int N, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(N, TPB); - naiveCoalescedReductionKernel - <<>>(dots, data, D, N); + int nblks = raft::ceildiv(N, TPB); + naiveCoalescedReductionKernel<<>>(dots, data, D, N); CUDA_CHECK(cudaPeekAtLastError()); } template -void unaryAndGemv(OutType *dots, const InType *data, int D, int N, - cudaStream_t stream) { - //computes a MLCommon unary op on data (squares it), then computes Ax +void unaryAndGemv(OutType* dots, const InType* data, int D, int N, cudaStream_t stream) +{ + // computes a MLCommon unary op on data (squares it), then computes Ax //(A input matrix and x column vector) to sum columns thrust::device_vector sq(D * N); raft::linalg::unaryOp( - thrust::raw_pointer_cast(sq.data()), data, D * N, - [] __device__(InType v) { return static_cast(v * v); }, stream); + thrust::raw_pointer_cast(sq.data()), + data, + D * N, + [] __device__(InType v) { return static_cast(v * v); }, + stream); cublasHandle_t handle; CUBLAS_CHECK(cublasCreate(&handle)); - thrust::device_vector ones(N, 1); //column vector [1...1] + thrust::device_vector ones(N, 1); // column vector [1...1] OutType alpha = 1, beta = 0; - CUBLAS_CHECK(raft::linalg::cublasgemv( - handle, CUBLAS_OP_N, D, N, &alpha, thrust::raw_pointer_cast(sq.data()), D, - thrust::raw_pointer_cast(ones.data()), 1, &beta, dots, 1, stream)); + CUBLAS_CHECK(raft::linalg::cublasgemv(handle, + CUBLAS_OP_N, + D, + N, + &alpha, + thrust::raw_pointer_cast(sq.data()), + D, + thrust::raw_pointer_cast(ones.data()), + 1, + &beta, + dots, + 1, + stream)); CUDA_CHECK(cudaDeviceSynchronize()); CUBLAS_CHECK(cublasDestroy(handle)); } template -void naiveReduction(OutType *dots, const InType *data, int D, int N, - bool rowMajor, bool alongRows, cudaStream_t stream) { +void naiveReduction(OutType* dots, + const InType* data, + int D, + int N, + bool rowMajor, + bool alongRows, + cudaStream_t stream) +{ if (rowMajor && alongRows) { naiveCoalescedReduction(dots, data, D, N, stream); } else if (rowMajor && !alongRows) { diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu index b27fa2ac1a..57699cb050 100644 --- a/cpp/test/linalg/strided_reduction.cu +++ b/cpp/test/linalg/strided_reduction.cu @@ -32,17 +32,17 @@ struct stridedReductionInputs { }; template -void stridedReductionLaunch(T *dots, const T *data, int cols, int rows, - cudaStream_t stream) { - stridedReduction(dots, data, cols, rows, (T)0, stream, false, - [] __device__(T in, int i) { return in * in; }); +void stridedReductionLaunch(T* dots, const T* data, int cols, int rows, cudaStream_t stream) +{ + stridedReduction( + dots, data, cols, rows, (T)0, stream, false, [] __device__(T in, int i) { return in * in; }); } template -class stridedReductionTest - : public ::testing::TestWithParam> { +class stridedReductionTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { CUDA_CHECK(cudaStreamCreate(&stream)); params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); @@ -50,16 +50,17 @@ class stridedReductionTest int len = rows * cols; raft::allocate(data, len); - raft::allocate(dots_exp, cols); //expected dot products (from test) - raft::allocate(dots_act, cols); //actual dot products (from prim) + raft::allocate(dots_exp, cols); // expected dot products (from test) + raft::allocate(dots_act, cols); // actual dot products (from prim) r.uniform(data, len, T(-1.0), T(1.0), - stream); //initialize matrix to random + stream); // initialize matrix to random unaryAndGemv(dots_exp, data, cols, rows, stream); stridedReductionLaunch(dots_act, data, cols, rows, stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(dots_exp)); CUDA_CHECK(cudaFree(dots_act)); @@ -72,35 +73,33 @@ class stridedReductionTest cudaStream_t stream; }; -const std::vector> inputsf = { - {0.00001f, 1024, 32, 1234ULL}, - {0.00001f, 1024, 64, 1234ULL}, - {0.00001f, 1024, 128, 1234ULL}, - {0.00001f, 1024, 256, 1234ULL}}; +const std::vector> inputsf = {{0.00001f, 1024, 32, 1234ULL}, + {0.00001f, 1024, 64, 1234ULL}, + {0.00001f, 1024, 128, 1234ULL}, + {0.00001f, 1024, 256, 1234ULL}}; -const std::vector> inputsd = { - {0.000000001, 1024, 32, 1234ULL}, - {0.000000001, 1024, 64, 1234ULL}, - {0.000000001, 1024, 128, 1234ULL}, - {0.000000001, 1024, 256, 1234ULL}}; +const std::vector> inputsd = {{0.000000001, 1024, 32, 1234ULL}, + {0.000000001, 1024, 64, 1234ULL}, + {0.000000001, 1024, 128, 1234ULL}, + {0.000000001, 1024, 256, 1234ULL}}; typedef stridedReductionTest stridedReductionTestF; -TEST_P(stridedReductionTestF, Result) { - ASSERT_TRUE(devArrMatch(dots_exp, dots_act, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(stridedReductionTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(dots_exp, dots_act, params.cols, raft::CompareApprox(params.tolerance))); } typedef stridedReductionTest stridedReductionTestD; -TEST_P(stridedReductionTestD, Result) { - ASSERT_TRUE(devArrMatch(dots_exp, dots_act, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(stridedReductionTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(dots_exp, dots_act, params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu index ced3f65fdd..4295b91f3e 100644 --- a/cpp/test/linalg/subtract.cu +++ b/cpp/test/linalg/subtract.cu @@ -24,39 +24,34 @@ namespace raft { namespace linalg { template -__global__ void naiveSubtractElemKernel(Type *out, const Type *in1, - const Type *in2, int len) { +__global__ void naiveSubtractElemKernel(Type* out, const Type* in1, const Type* in2, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in1[idx] - in2[idx]; - } + if (idx < len) { out[idx] = in1[idx] - in2[idx]; } } template -void naiveSubtractElem(Type *out, const Type *in1, const Type *in2, int len, - cudaStream_t stream) { +void naiveSubtractElem(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveSubtractElemKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void naiveSubtractScalarKernel(Type *out, const Type *in1, - const Type in2, int len) { +__global__ void naiveSubtractScalarKernel(Type* out, const Type* in1, const Type in2, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in1[idx] - in2; - } + if (idx < len) { out[idx] = in1[idx] - in2; } } template -void naiveSubtractScalar(Type *out, const Type *in1, const Type in2, int len, - cudaStream_t stream) { +void naiveSubtractScalar(Type* out, const Type* in1, const Type in2, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); - naiveSubtractScalarKernel - <<>>(out, in1, in2, len); + int nblks = raft::ceildiv(len, TPB); + naiveSubtractScalarKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -68,14 +63,16 @@ struct SubtractInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const SubtractInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SubtractInputs& dims) +{ return os; } template class SubtractTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; @@ -98,7 +95,8 @@ class SubtractTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in1)); CUDA_CHECK(cudaFree(in2)); CUDA_CHECK(cudaFree(out_ref)); @@ -110,35 +108,33 @@ class SubtractTest : public ::testing::TestWithParam> { T *in1, *in2, *out_ref, *out; }; -const std::vector> inputsf2 = { - {0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}}; -const std::vector> inputsd2 = { - {0.00000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}}; typedef SubtractTest SubtractTestF; -TEST_P(SubtractTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len, - raft::CompareApprox(params.tolerance))); +TEST_P(SubtractTestF, Result) +{ + ASSERT_TRUE( + raft::devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); - ASSERT_TRUE(raft::devArrMatch(out_ref, in1, params.len, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(out_ref, in1, params.len, raft::CompareApprox(params.tolerance))); } typedef SubtractTest SubtractTestD; -TEST_P(SubtractTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len, - raft::CompareApprox(params.tolerance))); +TEST_P(SubtractTestD, Result) +{ + ASSERT_TRUE( + raft::devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); - ASSERT_TRUE(raft::devArrMatch(out_ref, in1, params.len, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + raft::devArrMatch(out_ref, in1, params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestD, ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu index fff321768f..e9e1a6dc02 100644 --- a/cpp/test/linalg/svd.cu +++ b/cpp/test/linalg/svd.cu @@ -35,19 +35,21 @@ struct SvdInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const SvdInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SvdInputs& dims) +{ return os; } template class SvdTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { raft::handle_t handle; params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - int len = params.len; + int len = params.len; cudaStream_t stream = handle.get_stream(); raft::allocate(data, len); @@ -56,7 +58,7 @@ class SvdTest : public ::testing::TestWithParam> { T data_h[] = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0}; raft::update_device(data, data_h, len, stream); - int left_evl = params.n_row * params.n_col; + int left_evl = params.n_row * params.n_col; int right_evl = params.n_col * params.n_col; raft::allocate(left_eig_vectors_qr, left_evl); @@ -67,8 +69,7 @@ class SvdTest : public ::testing::TestWithParam> { // allocate(right_eig_vectors_trans_jacobi, right_evl); // allocate(sing_vals_jacobi, params.n_col); - T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695, - 0.488195, 0.110706, -0.865685}; + T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695, 0.488195, 0.110706, -0.865685}; T right_eig_vectors_ref_h[] = {-0.638636, -0.769509, -0.769509, 0.638636}; @@ -78,18 +79,25 @@ class SvdTest : public ::testing::TestWithParam> { raft::allocate(right_eig_vectors_ref, right_evl); raft::allocate(sing_vals_ref, params.n_col); - raft::update_device(left_eig_vectors_ref, left_eig_vectors_ref_h, left_evl, - stream); - raft::update_device(right_eig_vectors_ref, right_eig_vectors_ref_h, - right_evl, stream); + raft::update_device(left_eig_vectors_ref, left_eig_vectors_ref_h, left_evl, stream); + raft::update_device(right_eig_vectors_ref, right_eig_vectors_ref_h, right_evl, stream); raft::update_device(sing_vals_ref, sing_vals_ref_h, params.n_col, stream); - svdQR(handle, data, params.n_row, params.n_col, sing_vals_qr, - left_eig_vectors_qr, right_eig_vectors_trans_qr, true, true, true, + svdQR(handle, + data, + params.n_row, + params.n_col, + sing_vals_qr, + left_eig_vectors_qr, + right_eig_vectors_trans_qr, + true, + true, + true, stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(left_eig_vectors_qr)); CUDA_CHECK(cudaFree(right_eig_vectors_trans_qr)); @@ -101,69 +109,71 @@ class SvdTest : public ::testing::TestWithParam> { protected: SvdInputs params; - T *data, *left_eig_vectors_qr, *right_eig_vectors_trans_qr, *sing_vals_qr, - *left_eig_vectors_ref, *right_eig_vectors_ref, *sing_vals_ref; + T *data, *left_eig_vectors_qr, *right_eig_vectors_trans_qr, *sing_vals_qr, *left_eig_vectors_ref, + *right_eig_vectors_ref, *sing_vals_ref; }; -const std::vector> inputsf2 = { - {0.00001f, 3 * 2, 3, 2, 1234ULL}}; +const std::vector> inputsf2 = {{0.00001f, 3 * 2, 3, 2, 1234ULL}}; -const std::vector> inputsd2 = { - {0.00001, 3 * 2, 3, 2, 1234ULL}}; +const std::vector> inputsd2 = {{0.00001, 3 * 2, 3, 2, 1234ULL}}; typedef SvdTest SvdTestValF; -TEST_P(SvdTestValF, Result) { - ASSERT_TRUE( - raft::devArrMatch(sing_vals_ref, sing_vals_qr, params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestValF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + sing_vals_ref, sing_vals_qr, params.n_col, raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestValD; -TEST_P(SvdTestValD, Result) { - ASSERT_TRUE( - raft::devArrMatch(sing_vals_ref, sing_vals_qr, params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestValD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + sing_vals_ref, sing_vals_qr, params.n_col, raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestLeftVecF; -TEST_P(SvdTestLeftVecF, Result) { - ASSERT_TRUE(raft::devArrMatch( - left_eig_vectors_ref, left_eig_vectors_qr, params.n_row * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestLeftVecF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(left_eig_vectors_ref, + left_eig_vectors_qr, + params.n_row * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestLeftVecD; -TEST_P(SvdTestLeftVecD, Result) { - ASSERT_TRUE(raft::devArrMatch( - left_eig_vectors_ref, left_eig_vectors_qr, params.n_row * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestLeftVecD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(left_eig_vectors_ref, + left_eig_vectors_qr, + params.n_row * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestRightVecF; -TEST_P(SvdTestRightVecF, Result) { - ASSERT_TRUE( - raft::devArrMatch(right_eig_vectors_ref, right_eig_vectors_trans_qr, - params.n_col * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestRightVecF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(right_eig_vectors_ref, + right_eig_vectors_trans_qr, + params.n_col * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestRightVecD; -TEST_P(SvdTestRightVecD, Result) { - ASSERT_TRUE( - raft::devArrMatch(right_eig_vectors_ref, right_eig_vectors_trans_qr, - params.n_col * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestRightVecD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(right_eig_vectors_ref, + right_eig_vectors_trans_qr, + params.n_col * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValF, ::testing::ValuesIn(inputsf2)); INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecD, ::testing::ValuesIn(inputsd2)); // INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecF, // ::testing::ValuesIn(inputsf2)); diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index f10b029962..659bed04c6 100644 --- a/cpp/test/linalg/transpose.cu +++ b/cpp/test/linalg/transpose.cu @@ -34,14 +34,16 @@ struct TranposeInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const TranposeInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const TranposeInputs& dims) +{ return os; } template class TransposeTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); stream = handle.get_stream(); @@ -63,7 +65,8 @@ class TransposeTest : public ::testing::TestWithParam> { transpose(data, params.n_row, stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(data_trans)); CUDA_CHECK(cudaFree(data_trans_ref)); @@ -76,39 +79,33 @@ class TransposeTest : public ::testing::TestWithParam> { cudaStream_t stream; }; -const std::vector> inputsf2 = { - {0.1f, 3 * 3, 3, 3, 1234ULL}}; +const std::vector> inputsf2 = {{0.1f, 3 * 3, 3, 3, 1234ULL}}; -const std::vector> inputsd2 = { - {0.1, 3 * 3, 3, 3, 1234ULL}}; +const std::vector> inputsd2 = {{0.1, 3 * 3, 3, 3, 1234ULL}}; typedef TransposeTest TransposeTestValF; -TEST_P(TransposeTestValF, Result) { - ASSERT_TRUE( - raft::devArrMatch(data_trans_ref, data_trans, params.len, - raft::CompareApproxAbs(params.tolerance))); - - ASSERT_TRUE( - raft::devArrMatch(data_trans_ref, data, params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(TransposeTestValF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + data_trans_ref, data_trans, params.len, raft::CompareApproxAbs(params.tolerance))); + + ASSERT_TRUE(raft::devArrMatch( + data_trans_ref, data, params.len, raft::CompareApproxAbs(params.tolerance))); } typedef TransposeTest TransposeTestValD; -TEST_P(TransposeTestValD, Result) { - ASSERT_TRUE( - raft::devArrMatch(data_trans_ref, data_trans, params.len, - raft::CompareApproxAbs(params.tolerance))); - - ASSERT_TRUE( - raft::devArrMatch(data_trans_ref, data, params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(TransposeTestValD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + data_trans_ref, data_trans, params.len, raft::CompareApproxAbs(params.tolerance))); + + ASSERT_TRUE(raft::devArrMatch( + data_trans_ref, data, params.len, raft::CompareApproxAbs(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD, ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu index 666ab8619d..6349a1907a 100644 --- a/cpp/test/linalg/unary_op.cu +++ b/cpp/test/linalg/unary_op.cu @@ -28,28 +28,25 @@ namespace linalg { // for an extended __device__ lambda cannot have private or protected access // within its class template -void unaryOpLaunch(OutType *out, const InType *in, InType scalar, IdxType len, - cudaStream_t stream) { +void unaryOpLaunch(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ if (in == nullptr) { auto op = [scalar] __device__(OutType * ptr, IdxType idx) { *ptr = static_cast(scalar * idx); }; writeOnlyUnaryOp(out, len, op, stream); } else { - auto op = [scalar] __device__(InType in) { - return static_cast(in * scalar); - }; + auto op = [scalar] __device__(InType in) { return static_cast(in * scalar); }; unaryOp(out, in, len, op, stream); } } template -class UnaryOpTest - : public ::testing::TestWithParam> { +class UnaryOpTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = ::testing::TestWithParam< - UnaryOpInputs>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); CUDA_CHECK(cudaStreamCreate(&stream)); auto len = params.len; @@ -59,7 +56,8 @@ class UnaryOpTest r.uniform(in, len, InType(-1.0), InType(1.0), stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(in)); @@ -67,18 +65,18 @@ class UnaryOpTest CUDA_CHECK(cudaFree(out)); } - virtual void DoTest() { - auto len = params.len; + virtual void DoTest() + { + auto len = params.len; auto scalar = params.scalar; naiveScale(out_ref, in, scalar, len, stream); unaryOpLaunch(out, in, scalar, len, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, - CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); } UnaryOpInputs params; - InType *in; + InType* in; OutType *out_ref, *out; cudaStream_t stream; }; @@ -86,14 +84,15 @@ class UnaryOpTest template class WriteOnlyUnaryOpTest : public UnaryOpTest { protected: - void DoTest() override { - auto len = this->params.len; + void DoTest() override + { + auto len = this->params.len; auto scalar = this->params.scalar; - naiveScale(this->out_ref, (OutType *)nullptr, scalar, len, this->stream); - unaryOpLaunch(this->out, (OutType *)nullptr, scalar, len, this->stream); + naiveScale(this->out_ref, (OutType*)nullptr, scalar, len, this->stream); + unaryOpLaunch(this->out, (OutType*)nullptr, scalar, len, this->stream); CUDA_CHECK(cudaStreamSynchronize(this->stream)); - ASSERT_TRUE(devArrMatch(this->out_ref, this->out, this->params.len, - CompareApprox(this->params.tolerance))); + ASSERT_TRUE(devArrMatch( + this->out_ref, this->out, this->params.len, CompareApprox(this->params.tolerance))); } }; @@ -101,8 +100,7 @@ class WriteOnlyUnaryOpTest : public UnaryOpTest { TEST_P(Name, Result) { DoTest(); } \ INSTANTIATE_TEST_SUITE_P(UnaryOpTests, Name, ::testing::ValuesIn(inputs)) -const std::vector> inputsf_i32 = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf_i32 = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; typedef UnaryOpTest UnaryOpTestF_i32; UNARY_OP_TEST(UnaryOpTestF_i32, inputsf_i32); typedef WriteOnlyUnaryOpTest WriteOnlyUnaryOpTestF_i32; diff --git a/cpp/test/linalg/unary_op.cuh b/cpp/test/linalg/unary_op.cuh index be3f1124c5..3343389af8 100644 --- a/cpp/test/linalg/unary_op.cuh +++ b/cpp/test/linalg/unary_op.cuh @@ -24,8 +24,8 @@ namespace raft { namespace linalg { template -__global__ void naiveScaleKernel(OutType *out, const InType *in, InType scalar, - IdxType len) { +__global__ void naiveScaleKernel(OutType* out, const InType* in, InType scalar, IdxType len) +{ IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x); if (idx < len) { if (in == nullptr) { @@ -38,12 +38,11 @@ __global__ void naiveScaleKernel(OutType *out, const InType *in, InType scalar, } template -void naiveScale(OutType *out, const InType *in, InType scalar, int len, - cudaStream_t stream) { +void naiveScale(OutType* out, const InType* in, InType scalar, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); - naiveScaleKernel - <<>>(out, in, scalar, len); + int nblks = raft::ceildiv(len, TPB); + naiveScaleKernel<<>>(out, in, scalar, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -56,8 +55,8 @@ struct UnaryOpInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const UnaryOpInputs &d) { +::std::ostream& operator<<(::std::ostream& os, const UnaryOpInputs& d) +{ return os; } diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu index 578139623a..9cdd36b252 100644 --- a/cpp/test/matrix/math.cu +++ b/cpp/test/matrix/math.cu @@ -24,53 +24,51 @@ namespace raft { namespace matrix { template -__global__ void nativePowerKernel(Type *in, Type *out, int len) { +__global__ void nativePowerKernel(Type* in, Type* out, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in[idx] * in[idx]; - } + if (idx < len) { out[idx] = in[idx] * in[idx]; } } template -void naivePower(Type *in, Type *out, int len, cudaStream_t stream) { +void naivePower(Type* in, Type* out, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); nativePowerKernel<<>>(in, out, len); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void nativeSqrtKernel(Type *in, Type *out, int len) { +__global__ void nativeSqrtKernel(Type* in, Type* out, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = sqrt(in[idx]); - } + if (idx < len) { out[idx] = sqrt(in[idx]); } } template -void naiveSqrt(Type *in, Type *out, int len) { +void naiveSqrt(Type* in, Type* out, int len) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); nativeSqrtKernel<<>>(in, out, len); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void naiveSignFlipKernel(Type *in, Type *out, int rowCount, - int colCount) { +__global__ void naiveSignFlipKernel(Type* in, Type* out, int rowCount, int colCount) +{ int d_i = blockIdx.x * rowCount; int end = d_i + rowCount; if (blockIdx.x < colCount) { - Type max = 0.0; + Type max = 0.0; int max_index = 0; for (int i = d_i; i < end; i++) { Type val = in[i]; - if (val < 0.0) { - val = -val; - } + if (val < 0.0) { val = -val; } if (val > max) { - max = val; + max = val; max_index = i; } } @@ -88,7 +86,8 @@ __global__ void naiveSignFlipKernel(Type *in, Type *out, int rowCount, } template -void naiveSignFlip(Type *in, Type *out, int rowCount, int colCount) { +void naiveSignFlip(Type* in, Type* out, int rowCount, int colCount) +{ naiveSignFlipKernel<<>>(in, out, rowCount, colCount); CUDA_CHECK(cudaPeekAtLastError()); } @@ -103,14 +102,16 @@ struct MathInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const MathInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MathInputs& dims) +{ return os; } template class MathTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); random::Rng r(params.seed); int len = params.len; @@ -154,7 +155,7 @@ class MathTest : public ::testing::TestWithParam> { allocate(in_recip_ref, 4); allocate(out_recip, 4); // default threshold is 1e-15 - std::vector in_recip_h = {0.1, 0.01, -0.01, 0.1e-16}; + std::vector in_recip_h = {0.1, 0.01, -0.01, 0.1e-16}; std::vector in_recip_ref_h = {10.0, 100.0, -100.0, 0.0}; update_device(in_recip, in_recip_h.data(), 4, stream); update_device(in_recip_ref, in_recip_ref_h.data(), 4, stream); @@ -165,7 +166,7 @@ class MathTest : public ::testing::TestWithParam> { reciprocal(in_recip, recip_scalar, 4, stream, true); - std::vector in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1}; + std::vector in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1}; std::vector in_small_val_zero_ref_h = {0.1, 0.0, 0.0, -0.1}; allocate(in_smallzero, 4); allocate(out_smallzero, 4); @@ -177,7 +178,8 @@ class MathTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in_power)); CUDA_CHECK(cudaFree(out_power_ref)); CUDA_CHECK(cudaFree(in_sqrt)); @@ -196,137 +198,129 @@ class MathTest : public ::testing::TestWithParam> { protected: MathInputs params; - T *in_power, *out_power_ref, *in_sqrt, *out_sqrt_ref, *in_ratio, - *out_ratio_ref, *in_sign_flip, *out_sign_flip_ref, *in_recip, *in_recip_ref, - *out_recip, *in_smallzero, *out_smallzero, *out_smallzero_ref; + T *in_power, *out_power_ref, *in_sqrt, *out_sqrt_ref, *in_ratio, *out_ratio_ref, *in_sign_flip, + *out_sign_flip_ref, *in_recip, *in_recip_ref, *out_recip, *in_smallzero, *out_smallzero, + *out_smallzero_ref; }; -const std::vector> inputsf = { - {0.00001f, 1024, 1024, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf = {{0.00001f, 1024, 1024, 1024 * 1024, 1234ULL}}; -const std::vector> inputsd = { - {0.00001, 1024, 1024, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd = {{0.00001, 1024, 1024, 1024 * 1024, 1234ULL}}; typedef MathTest MathPowerTestF; -TEST_P(MathPowerTestF, Result) { - ASSERT_TRUE(devArrMatch(in_power, out_power_ref, params.len, - CompareApprox(params.tolerance))); +TEST_P(MathPowerTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(in_power, out_power_ref, params.len, CompareApprox(params.tolerance))); } typedef MathTest MathPowerTestD; -TEST_P(MathPowerTestD, Result) { - ASSERT_TRUE(devArrMatch(in_power, out_power_ref, params.len, - CompareApprox(params.tolerance))); +TEST_P(MathPowerTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(in_power, out_power_ref, params.len, CompareApprox(params.tolerance))); } typedef MathTest MathSqrtTestF; -TEST_P(MathSqrtTestF, Result) { - ASSERT_TRUE(devArrMatch(in_sqrt, out_sqrt_ref, params.len, - CompareApprox(params.tolerance))); +TEST_P(MathSqrtTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(in_sqrt, out_sqrt_ref, params.len, CompareApprox(params.tolerance))); } typedef MathTest MathSqrtTestD; -TEST_P(MathSqrtTestD, Result) { - ASSERT_TRUE(devArrMatch(in_sqrt, out_sqrt_ref, params.len, - CompareApprox(params.tolerance))); +TEST_P(MathSqrtTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(in_sqrt, out_sqrt_ref, params.len, CompareApprox(params.tolerance))); } typedef MathTest MathRatioTestF; -TEST_P(MathRatioTestF, Result) { - ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, - CompareApprox(params.tolerance))); +TEST_P(MathRatioTestF, Result) +{ + ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, CompareApprox(params.tolerance))); } typedef MathTest MathRatioTestD; -TEST_P(MathRatioTestD, Result) { - ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, - CompareApprox(params.tolerance))); +TEST_P(MathRatioTestD, Result) +{ + ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, CompareApprox(params.tolerance))); } typedef MathTest MathSignFlipTestF; -TEST_P(MathSignFlipTestF, Result) { - ASSERT_TRUE(devArrMatch(in_sign_flip, out_sign_flip_ref, params.len, - CompareApprox(params.tolerance))); +TEST_P(MathSignFlipTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + in_sign_flip, out_sign_flip_ref, params.len, CompareApprox(params.tolerance))); } typedef MathTest MathSignFlipTestD; -TEST_P(MathSignFlipTestD, Result) { - ASSERT_TRUE(devArrMatch(in_sign_flip, out_sign_flip_ref, params.len, - CompareApprox(params.tolerance))); +TEST_P(MathSignFlipTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + in_sign_flip, out_sign_flip_ref, params.len, CompareApprox(params.tolerance))); } typedef MathTest MathReciprocalTestF; -TEST_P(MathReciprocalTestF, Result) { - ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4, - CompareApprox(params.tolerance))); +TEST_P(MathReciprocalTestF, Result) +{ + ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4, CompareApprox(params.tolerance))); // 4-th term tests `setzero=true` functionality, not present in this version of `reciprocal`. - ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3, - CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3, CompareApprox(params.tolerance))); } typedef MathTest MathReciprocalTestD; -TEST_P(MathReciprocalTestD, Result) { - ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4, - CompareApprox(params.tolerance))); +TEST_P(MathReciprocalTestD, Result) +{ + ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4, CompareApprox(params.tolerance))); // 4-th term tests `setzero=true` functionality, not present in this version of `reciprocal`. - ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3, - CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3, CompareApprox(params.tolerance))); } typedef MathTest MathSetSmallZeroTestF; -TEST_P(MathSetSmallZeroTestF, Result) { - ASSERT_TRUE(devArrMatch(in_smallzero, out_smallzero_ref, 4, - CompareApprox(params.tolerance))); +TEST_P(MathSetSmallZeroTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(in_smallzero, out_smallzero_ref, 4, CompareApprox(params.tolerance))); - ASSERT_TRUE(devArrMatch(out_smallzero, out_smallzero_ref, 4, - CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(out_smallzero, out_smallzero_ref, 4, CompareApprox(params.tolerance))); } typedef MathTest MathSetSmallZeroTestD; -TEST_P(MathSetSmallZeroTestD, Result) { - ASSERT_TRUE(devArrMatch(in_smallzero, out_smallzero_ref, 4, - CompareApprox(params.tolerance))); +TEST_P(MathSetSmallZeroTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(in_smallzero, out_smallzero_ref, 4, CompareApprox(params.tolerance))); - ASSERT_TRUE(devArrMatch(out_smallzero, out_smallzero_ref, 4, - CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(out_smallzero, out_smallzero_ref, 4, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestF, - ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestD, ::testing::ValuesIn(inputsd)); } // namespace matrix } // namespace raft diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu index 28222c0697..fc5a418bda 100644 --- a/cpp/test/matrix/matrix.cu +++ b/cpp/test/matrix/matrix.cu @@ -32,14 +32,16 @@ struct MatrixInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const MatrixInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MatrixInputs& dims) +{ return os; } template class MatrixTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.n_row * params.n_col; @@ -54,13 +56,14 @@ class MatrixTest : public ::testing::TestWithParam> { // copy(in1, in1_revr, params.n_row, params.n_col); // colReverse(in1_revr, params.n_row, params.n_col); - T *outTrunc; + T* outTrunc; raft::allocate(outTrunc, 6); truncZeroOrigin(in1, params.n_row, outTrunc, 3, 2, stream); CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(in1)); CUDA_CHECK(cudaFree(in2)); // CUDA_CHECK(cudaFree(in1_revr)); @@ -73,31 +76,30 @@ class MatrixTest : public ::testing::TestWithParam> { const std::vector> inputsf2 = {{0.000001f, 4, 4, 1234ULL}}; -const std::vector> inputsd2 = { - {0.00000001, 4, 4, 1234ULL}}; +const std::vector> inputsd2 = {{0.00000001, 4, 4, 1234ULL}}; typedef MatrixTest MatrixTestF; -TEST_P(MatrixTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(in1, in2, params.n_row * params.n_col, - raft::CompareApprox(params.tolerance))); +TEST_P(MatrixTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + in1, in2, params.n_row * params.n_col, raft::CompareApprox(params.tolerance))); } typedef MatrixTest MatrixTestD; -TEST_P(MatrixTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(in1, in2, params.n_row * params.n_col, - raft::CompareApprox(params.tolerance))); +TEST_P(MatrixTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + in1, in2, params.n_row * params.n_col, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestD, ::testing::ValuesIn(inputsd2)); template class MatrixCopyRowsTest : public ::testing::Test { - using math_t = typename std::tuple_element<0, T>::type; - using idx_t = typename std::tuple_element<1, T>::type; + using math_t = typename std::tuple_element<0, T>::type; + using idx_t = typename std::tuple_element<1, T>::type; using idx_array_t = typename std::tuple_element<2, T>::type; protected: @@ -105,42 +107,38 @@ class MatrixCopyRowsTest : public ::testing::Test { : allocator(handle.get_device_allocator()), input(allocator, handle.get_stream(), n_cols * n_rows), indices(allocator, handle.get_stream(), n_selected), - output(allocator, handle.get_stream(), n_cols * n_selected) { + output(allocator, handle.get_stream(), n_cols * n_selected) + { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); raft::update_device(indices.data(), indices_host, n_selected, stream); // Init input array thrust::counting_iterator first(0); thrust::device_ptr ptr(input.data()); - thrust::copy(thrust::cuda::par.on(stream), first, first + n_cols * n_rows, - ptr); + thrust::copy(thrust::cuda::par.on(stream), first, first + n_cols * n_rows, ptr); } void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } - void testCopyRows() { - copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(), - n_selected, stream, false); - EXPECT_TRUE(raft::devArrMatchHost(output_exp_colmajor, output.data(), - n_selected * n_cols, - raft::Compare())); - copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(), - n_selected, stream, true); - EXPECT_TRUE(raft::devArrMatchHost(output_exp_rowmajor, output.data(), - n_selected * n_cols, - raft::Compare())); + void testCopyRows() + { + copyRows( + input.data(), n_rows, n_cols, output.data(), indices.data(), n_selected, stream, false); + EXPECT_TRUE(raft::devArrMatchHost( + output_exp_colmajor, output.data(), n_selected * n_cols, raft::Compare())); + copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(), n_selected, stream, true); + EXPECT_TRUE(raft::devArrMatchHost( + output_exp_rowmajor, output.data(), n_selected * n_cols, raft::Compare())); } protected: - int n_rows = 10; - int n_cols = 3; + int n_rows = 10; + int n_cols = 3; int n_selected = 5; - idx_array_t indices_host[5] = {0, 3, 4, 7, 9}; - math_t output_exp_colmajor[15] = {0, 3, 4, 7, 9, 10, 13, 14, - 17, 19, 20, 23, 24, 27, 29}; - math_t output_exp_rowmajor[15] = {0, 1, 2, 9, 10, 11, 12, 13, - 14, 21, 22, 23, 27, 28, 29}; + idx_array_t indices_host[5] = {0, 3, 4, 7, 9}; + math_t output_exp_colmajor[15] = {0, 3, 4, 7, 9, 10, 13, 14, 17, 19, 20, 23, 24, 27, 29}; + math_t output_exp_rowmajor[15] = {0, 1, 2, 9, 10, 11, 12, 13, 14, 21, 22, 23, 27, 28, 29}; raft::handle_t handle; cudaStream_t stream; std::shared_ptr allocator; @@ -149,10 +147,10 @@ class MatrixCopyRowsTest : public ::testing::Test { raft::mr::device::buffer indices; }; -using TypeTuple = - ::testing::Types, std::tuple, - std::tuple, - std::tuple>; +using TypeTuple = ::testing::Types, + std::tuple, + std::tuple, + std::tuple>; TYPED_TEST_CASE(MatrixCopyRowsTest, TypeTuple); TYPED_TEST(MatrixCopyRowsTest, CopyRows) { this->testCopyRows(); } diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp index 223efdbfe8..9ba2c3332b 100644 --- a/cpp/test/mr/device/buffer.cpp +++ b/cpp/test/mr/device/buffer.cpp @@ -25,7 +25,8 @@ namespace raft { namespace mr { namespace device { -TEST(Raft, DeviceBufferAlloc) { +TEST(Raft, DeviceBufferAlloc) +{ auto alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -52,13 +53,14 @@ TEST(Raft, DeviceBufferAlloc) { CUDA_CHECK(cudaStreamDestroy(stream)); } -TEST(Raft, DeviceBufferZeroResize) { +TEST(Raft, DeviceBufferZeroResize) +{ // Create a limiting_resource_adaptor to track allocations - auto curr_mr = dynamic_cast( - rmm::mr::get_current_device_resource()); - auto limit_mr = std::make_shared< - rmm::mr::limiting_resource_adaptor>(curr_mr, - 1000); + auto curr_mr = + dynamic_cast(rmm::mr::get_current_device_resource()); + auto limit_mr = + std::make_shared>(curr_mr, + 1000); rmm::mr::set_current_device_resource(limit_mr.get()); diff --git a/cpp/test/mr/host/buffer.cpp b/cpp/test/mr/host/buffer.cpp index 953f65ddfb..aadf05285c 100644 --- a/cpp/test/mr/host/buffer.cpp +++ b/cpp/test/mr/host/buffer.cpp @@ -24,7 +24,8 @@ namespace raft { namespace mr { namespace host { -TEST(Raft, HostBuffer) { +TEST(Raft, HostBuffer) +{ auto alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -51,14 +52,14 @@ TEST(Raft, HostBuffer) { CUDA_CHECK(cudaStreamDestroy(stream)); } -TEST(Raft, DeviceToHostBuffer) { +TEST(Raft, DeviceToHostBuffer) +{ auto d_alloc = std::make_shared(); auto h_alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); device::buffer d_buff(d_alloc, stream, 32); - CUDA_CHECK( - cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream)); + CUDA_CHECK(cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream)); buffer h_buff(h_alloc, d_buff); ASSERT_EQ(d_buff.size(), h_buff.size()); CUDA_CHECK(cudaStreamSynchronize(stream)); diff --git a/cpp/test/mst.cu b/cpp/test/mst.cu index d7aa76500b..5560c61c73 100644 --- a/cpp/test/mst.cu +++ b/cpp/test/mst.cu @@ -54,7 +54,8 @@ namespace mst { // Sequential prims function // Returns total weight of MST template -weight_t prims(CSRHost &csr_h) { +weight_t prims(CSRHost& csr_h) +{ auto n_vertices = csr_h.offsets.size() - 1; bool active_vertex[n_vertices]; @@ -63,19 +64,18 @@ weight_t prims(CSRHost &csr_h) { for (auto i = 0; i < n_vertices; i++) { active_vertex[i] = false; - curr_edge[i] = INT_MAX; + curr_edge[i] = INT_MAX; } curr_edge[0] = 0; // function to pick next min vertex-edge - auto min_vertex_edge = [](auto *curr_edge, auto *active_vertex, - auto n_vertices) { + auto min_vertex_edge = [](auto* curr_edge, auto* active_vertex, auto n_vertices) { weight_t min = INT_MAX; vertex_t min_vertex; for (auto v = 0; v < n_vertices; v++) { if (!active_vertex[v] && curr_edge[v] < min) { - min = curr_edge[v]; + min = curr_edge[v]; min_vertex = v; } } @@ -91,14 +91,13 @@ weight_t prims(CSRHost &csr_h) { active_vertex[curr_v] = true; // set to active // iterate through edges of current active vertex - auto edge_st = csr_h.offsets[curr_v]; + auto edge_st = csr_h.offsets[curr_v]; auto edge_end = csr_h.offsets[curr_v + 1]; for (auto e = edge_st; e < edge_end; e++) { // put edges to be considered for next iteration auto neighbor_idx = csr_h.indices[e]; - if (!active_vertex[neighbor_idx] && - csr_h.weights[e] < curr_edge[neighbor_idx]) { + if (!active_vertex[neighbor_idx] && csr_h.weights[e] < curr_edge[neighbor_idx]) { curr_edge[neighbor_idx] = csr_h.weights[e]; } } @@ -114,99 +113,101 @@ weight_t prims(CSRHost &csr_h) { } template -class MSTTest - : public ::testing::TestWithParam> { +class MSTTest : public ::testing::TestWithParam> { protected: std::pair, raft::Graph_COO> - mst_gpu() { - edge_t *offsets = static_cast(csr_d.offsets.data()); - vertex_t *indices = static_cast(csr_d.indices.data()); - weight_t *weights = static_cast(csr_d.weights.data()); + mst_gpu() + { + edge_t* offsets = static_cast(csr_d.offsets.data()); + vertex_t* indices = static_cast(csr_d.indices.data()); + weight_t* weights = static_cast(csr_d.weights.data()); v = static_cast((csr_d.offsets.size() / sizeof(vertex_t)) - 1); e = static_cast(csr_d.indices.size() / sizeof(edge_t)); - rmm::device_vector mst_src(2 * v - 2, - std::numeric_limits::max()); - rmm::device_vector mst_dst(2 * v - 2, - std::numeric_limits::max()); + rmm::device_vector mst_src(2 * v - 2, std::numeric_limits::max()); + rmm::device_vector mst_dst(2 * v - 2, std::numeric_limits::max()); rmm::device_vector color(v, 0); - vertex_t *color_ptr = thrust::raw_pointer_cast(color.data()); + vertex_t* color_ptr = thrust::raw_pointer_cast(color.data()); if (iterations == 0) { MST_solver symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - true, true, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), true, true, 0); auto symmetric_result = symmetric_solver.solve(); MST_solver non_symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - false, true, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), false, true, 0); auto non_symmetric_result = non_symmetric_solver.solve(); EXPECT_LE(symmetric_result.n_edges, 2 * v - 2); EXPECT_LE(non_symmetric_result.n_edges, v - 1); - return std::make_pair(std::move(symmetric_result), - std::move(non_symmetric_result)); + return std::make_pair(std::move(symmetric_result), std::move(non_symmetric_result)); } else { - MST_solver intermediate_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - true, true, iterations); + MST_solver intermediate_solver(handle, + offsets, + indices, + weights, + v, + e, + color_ptr, + handle.get_stream(), + true, + true, + iterations); auto intermediate_result = intermediate_solver.solve(); MST_solver symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - true, false, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), true, false, 0); auto symmetric_result = symmetric_solver.solve(); // symmetric_result.n_edges += intermediate_result.n_edges; - auto total_edge_size = - symmetric_result.n_edges + intermediate_result.n_edges; + auto total_edge_size = symmetric_result.n_edges + intermediate_result.n_edges; symmetric_result.src.resize(total_edge_size, handle.get_stream()); symmetric_result.dst.resize(total_edge_size, handle.get_stream()); symmetric_result.weights.resize(total_edge_size, handle.get_stream()); raft::copy(symmetric_result.src.data() + symmetric_result.n_edges, - intermediate_result.src.data(), intermediate_result.n_edges, + intermediate_result.src.data(), + intermediate_result.n_edges, handle.get_stream()); raft::copy(symmetric_result.dst.data() + symmetric_result.n_edges, - intermediate_result.dst.data(), intermediate_result.n_edges, + intermediate_result.dst.data(), + intermediate_result.n_edges, handle.get_stream()); raft::copy(symmetric_result.weights.data() + symmetric_result.n_edges, intermediate_result.weights.data(), - intermediate_result.n_edges, handle.get_stream()); + intermediate_result.n_edges, + handle.get_stream()); symmetric_result.n_edges = total_edge_size; MST_solver non_symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - false, true, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), false, true, 0); auto non_symmetric_result = non_symmetric_solver.solve(); EXPECT_LE(symmetric_result.n_edges, 2 * v - 2); EXPECT_LE(non_symmetric_result.n_edges, v - 1); - return std::make_pair(std::move(symmetric_result), - std::move(non_symmetric_result)); + return std::make_pair(std::move(symmetric_result), std::move(non_symmetric_result)); } } - void SetUp() override { - mst_input = ::testing::TestWithParam< - MSTTestInput>::GetParam(); + void SetUp() override + { + mst_input = ::testing::TestWithParam>::GetParam(); iterations = mst_input.iterations; - csr_d.offsets = rmm::device_buffer( - mst_input.csr_h.offsets.data(), - mst_input.csr_h.offsets.size() * sizeof(edge_t), handle.get_stream()); - csr_d.indices = rmm::device_buffer( - mst_input.csr_h.indices.data(), - mst_input.csr_h.indices.size() * sizeof(vertex_t), handle.get_stream()); - csr_d.weights = rmm::device_buffer( - mst_input.csr_h.weights.data(), - mst_input.csr_h.weights.size() * sizeof(weight_t), handle.get_stream()); + csr_d.offsets = rmm::device_buffer(mst_input.csr_h.offsets.data(), + mst_input.csr_h.offsets.size() * sizeof(edge_t), + handle.get_stream()); + csr_d.indices = rmm::device_buffer(mst_input.csr_h.indices.data(), + mst_input.csr_h.indices.size() * sizeof(vertex_t), + handle.get_stream()); + csr_d.weights = rmm::device_buffer(mst_input.csr_h.weights.data(), + mst_input.csr_h.weights.size() * sizeof(weight_t), + handle.get_stream()); } void TearDown() override {} @@ -259,41 +260,68 @@ const std::vector> csr_in_h = { const std::vector> csr_in4_h = { {{0, 3, 5, 8, 10, 12, 14, 16}, {2, 4, 5, 3, 6, 0, 4, 5, 1, 6, 0, 2, 0, 2, 1, 3}, - {5.0f, 9.0f, 1.0f, 8.0f, 7.0f, 5.0f, 2.0f, 6.0f, 8.0f, 10.0f, 9.0f, 2.0f, - 1.0f, 6.0f, 7.0f, 10.0f}}}; + {5.0f, + 9.0f, + 1.0f, + 8.0f, + 7.0f, + 5.0f, + 2.0f, + 6.0f, + 8.0f, + 10.0f, + 9.0f, + 2.0f, + 1.0f, + 6.0f, + 7.0f, + 10.0f}}}; // singletons const std::vector> csr_in5_h = { {{0, 3, 5, 8, 10, 10, 10, 12, 14, 16, 16}, {2, 8, 7, 3, 8, 0, 8, 7, 1, 8, 0, 2, 0, 2, 1, 3}, - {5.0f, 9.0f, 1.0f, 8.0f, 7.0f, 5.0f, 2.0f, 6.0f, 8.0f, 10.0f, 9.0f, 2.0f, - 1.0f, 6.0f, 7.0f, 10.0f}}}; + {5.0f, + 9.0f, + 1.0f, + 8.0f, + 7.0f, + 5.0f, + 2.0f, + 6.0f, + 8.0f, + 10.0f, + 9.0f, + 2.0f, + 1.0f, + 6.0f, + 7.0f, + 10.0f}}}; typedef MSTTest MSTTestSequential; -TEST_P(MSTTestSequential, Sequential) { - auto results_pair = mst_gpu(); - auto &symmetric_result = results_pair.first; - auto &non_symmetric_result = results_pair.second; +TEST_P(MSTTestSequential, Sequential) +{ + auto results_pair = mst_gpu(); + auto& symmetric_result = results_pair.first; + auto& non_symmetric_result = results_pair.second; // do assertions here // in this case, running sequential MST auto prims_result = prims(mst_input.csr_h); - auto symmetric_sum = - thrust::reduce(thrust::device, symmetric_result.weights.data(), - symmetric_result.weights.data() + symmetric_result.n_edges); - auto non_symmetric_sum = thrust::reduce( - thrust::device, non_symmetric_result.weights.data(), - non_symmetric_result.weights.data() + non_symmetric_result.n_edges); - - ASSERT_TRUE(raft::match(2 * prims_result, symmetric_sum, - raft::CompareApprox(0.1))); - ASSERT_TRUE(raft::match(prims_result, non_symmetric_sum, - raft::CompareApprox(0.1))); + auto symmetric_sum = thrust::reduce(thrust::device, + symmetric_result.weights.data(), + symmetric_result.weights.data() + symmetric_result.n_edges); + auto non_symmetric_sum = + thrust::reduce(thrust::device, + non_symmetric_result.weights.data(), + non_symmetric_result.weights.data() + non_symmetric_result.n_edges); + + ASSERT_TRUE(raft::match(2 * prims_result, symmetric_sum, raft::CompareApprox(0.1))); + ASSERT_TRUE(raft::match(prims_result, non_symmetric_sum, raft::CompareApprox(0.1))); } -INSTANTIATE_TEST_SUITE_P(MSTTests, MSTTestSequential, - ::testing::ValuesIn(csr_in_h)); +INSTANTIATE_TEST_SUITE_P(MSTTests, MSTTestSequential, ::testing::ValuesIn(csr_in_h)); } // namespace mst } // namespace raft diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu index af10dcab30..25c8fe5084 100644 --- a/cpp/test/random/rng.cu +++ b/cpp/test/random/rng.cu @@ -38,12 +38,13 @@ enum RandomType { }; template -__global__ void meanKernel(T* out, const T* data, int len) { +__global__ void meanKernel(T* out, const T* data, int len) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; int tid = threadIdx.x + blockIdx.x * blockDim.x; - T val = tid < len ? data[tid] : T(0); - T x = BlockReduce(temp_storage).Sum(val); + T val = tid < len ? data[tid] : T(0); + T x = BlockReduce(temp_storage).Sum(val); __syncthreads(); T xx = BlockReduce(temp_storage).Sum(val * val); __syncthreads(); @@ -70,7 +71,8 @@ struct RngInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const RngInputs& dims) { +::std::ostream& operator<<(::std::ostream& os, const RngInputs& dims) +{ return os; } @@ -80,46 +82,30 @@ template template class RngTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { // Tests are configured with their expected test-values sigma. For example, // 4 x sigma indicates the test shouldn't fail 99.9% of the time. num_sigma = 10; - params = ::testing::TestWithParam>::GetParam(); + params = ::testing::TestWithParam>::GetParam(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(params.seed, params.gtype); allocate(data, params.len); allocate(stats, 2, true); switch (params.type) { - case RNG_Normal: - r.normal(data, params.len, params.start, params.end, stream); - break; - case RNG_LogNormal: - r.lognormal(data, params.len, params.start, params.end, stream); - break; - case RNG_Uniform: - r.uniform(data, params.len, params.start, params.end, stream); - break; - case RNG_Gumbel: - r.gumbel(data, params.len, params.start, params.end, stream); - break; - case RNG_Logistic: - r.logistic(data, params.len, params.start, params.end, stream); - break; - case RNG_Exp: - r.exponential(data, params.len, params.start, stream); - break; - case RNG_Rayleigh: - r.rayleigh(data, params.len, params.start, stream); - break; - case RNG_Laplace: - r.laplace(data, params.len, params.start, params.end, stream); - break; + case RNG_Normal: r.normal(data, params.len, params.start, params.end, stream); break; + case RNG_LogNormal: r.lognormal(data, params.len, params.start, params.end, stream); break; + case RNG_Uniform: r.uniform(data, params.len, params.start, params.end, stream); break; + case RNG_Gumbel: r.gumbel(data, params.len, params.start, params.end, stream); break; + case RNG_Logistic: r.logistic(data, params.len, params.start, params.end, stream); break; + case RNG_Exp: r.exponential(data, params.len, params.start, stream); break; + case RNG_Rayleigh: r.rayleigh(data, params.len, params.start, stream); break; + case RNG_Laplace: r.laplace(data, params.len, params.start, params.end, stream); break; }; static const int threads = 128; meanKernel - <<>>(stats, data, - params.len); + <<>>(stats, data, params.len); update_host(h_stats, stats, 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; @@ -127,23 +113,24 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(stats)); } - void getExpectedMeanVar(T meanvar[2]) { + void getExpectedMeanVar(T meanvar[2]) + { switch (params.type) { case RNG_Normal: meanvar[0] = params.start; meanvar[1] = params.end * params.end; break; case RNG_LogNormal: { - auto var = params.end * params.end; - auto mu = params.start; + auto var = params.end * params.end; + auto mu = params.start; meanvar[0] = raft::myExp(mu + var * T(0.5)); - meanvar[1] = - (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var); + meanvar[1] = (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var); break; } case RNG_Uniform: @@ -167,8 +154,7 @@ class RngTest : public ::testing::TestWithParam> { break; case RNG_Rayleigh: meanvar[0] = params.start * raft::mySqrt(T(3.1415 / 2.0)); - meanvar[1] = - ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start; + meanvar[1] = ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start; break; case RNG_Laplace: meanvar[0] = params.start; @@ -259,13 +245,12 @@ const std::vector> inputsf = { {0.02, 32 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL}, {0.04, 8 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL}}; -TEST_P(RngTestF, Result) { +TEST_P(RngTestF, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], - CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], - CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestF, ::testing::ValuesIn(inputsf)); @@ -321,13 +306,12 @@ const std::vector> inputsd = { {0.025, 8 * 1024, 1.0, 1.0, RNG_Rayleigh, GenKiss99, 1234ULL}, {0.02, 32 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL}, {0.04, 8 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL}}; -TEST_P(RngTestD, Result) { +TEST_P(RngTestD, Result) +{ double meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], - CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], - CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd)); @@ -335,7 +319,8 @@ INSTANTIATE_TEST_SUITE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd)); // Test for expected variance in mean calculations template -T quick_mean(const std::vector& d) { +T quick_mean(const std::vector& d) +{ T acc = T(0); for (const auto& di : d) { acc += di; @@ -344,8 +329,9 @@ T quick_mean(const std::vector& d) { } template -T quick_std(const std::vector& d) { - T acc = T(0); +T quick_std(const std::vector& d) +{ + T acc = T(0); T d_mean = quick_mean(d); for (const auto& di : d) { acc += ((di - d_mean) * (di - d_mean)); @@ -354,7 +340,8 @@ T quick_std(const std::vector& d) { } template -std::ostream& operator<<(std::ostream& out, const std::vector& v) { +std::ostream& operator<<(std::ostream& out, const std::vector& v) +{ if (!v.empty()) { out << '['; std::copy(v.begin(), v.end(), std::ostream_iterator(out, ", ")); @@ -369,11 +356,12 @@ std::ostream& operator<<(std::ostream& out, const std::vector& v) { // experiments computing the mean, giving us a distribution of the mean // itself. The mean error is simply the standard deviation of this // distribution (the standard deviation of the mean). -TEST(Rng, MeanError) { +TEST(Rng, MeanError) +{ timeb time_struct; ftime(&time_struct); - int seed = time_struct.millitm; - int num_samples = 1024; + int seed = time_struct.millitm; + int num_samples = 1024; int num_experiments = 1024; float* data; float* mean_result; @@ -391,10 +379,9 @@ TEST(Rng, MeanError) { Rng r(seed, rtype); r.normal(data, len, 3.3f, 0.23f, stream); // r.uniform(data, len, -1.0, 2.0); - raft::stats::mean(mean_result, data, num_samples, num_experiments, false, - false, stream); - raft::stats::stddev(std_result, data, mean_result, num_samples, - num_experiments, false, false, stream); + raft::stats::mean(mean_result, data, num_samples, num_experiments, false, false, stream); + raft::stats::stddev( + std_result, data, mean_result, num_samples, num_experiments, false, false, stream); std::vector h_mean_result(num_experiments); std::vector h_std_result(num_experiments); update_host(h_mean_result.data(), mean_result, num_experiments, stream); @@ -403,8 +390,8 @@ TEST(Rng, MeanError) { auto d_mean = quick_mean(h_mean_result); // std-dev of mean; also known as mean error - auto d_std_of_mean = quick_std(h_mean_result); - auto d_std = quick_mean(h_std_result); + auto d_std_of_mean = quick_std(h_mean_result); + auto d_std = quick_mean(h_std_result); auto d_std_of_mean_analytical = d_std / std::sqrt(num_samples); // std::cout << "measured mean error: " << d_std_of_mean << "\n"; @@ -413,8 +400,7 @@ TEST(Rng, MeanError) { auto diff_expected_vs_measured_mean_error = std::abs(d_std_of_mean - d_std / std::sqrt(num_samples)); - ASSERT_TRUE( - (diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5)); + ASSERT_TRUE((diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5)); } CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(data)); @@ -427,7 +413,8 @@ TEST(Rng, MeanError) { template class ScaledBernoulliTest : public ::testing::Test { protected: - void SetUp() override { + void SetUp() override + { CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(42); @@ -438,12 +425,12 @@ class ScaledBernoulliTest : public ::testing::Test { void TearDown() override { CUDA_CHECK(cudaFree(data)); } - void rangeCheck() { + void rangeCheck() + { T* h_data = new T[len]; update_host(h_data, data, len, stream); - ASSERT_TRUE(std::none_of(h_data, h_data + len, [](const T& a) { - return a < -scale || a > scale; - })); + ASSERT_TRUE( + std::none_of(h_data, h_data + len, [](const T& a) { return a < -scale || a > scale; })); delete[] h_data; } @@ -460,7 +447,8 @@ TEST_F(ScaledBernoulliTest2, RangeCheck) { rangeCheck(); } template class BernoulliTest : public ::testing::Test { protected: - void SetUp() override { + void SetUp() override + { CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(42); allocate(data, len * sizeof(bool), stream); @@ -469,7 +457,8 @@ class BernoulliTest : public ::testing::Test { void TearDown() override { CUDA_CHECK(cudaFree(data)); } - void trueFalseCheck() { + void trueFalseCheck() + { // both true and false values must be present bool* h_data = new bool[len]; update_host(h_data, data, len, stream); @@ -499,21 +488,21 @@ struct RngNormalTableInputs { }; template -::std::ostream& operator<<(::std::ostream& os, - const RngNormalTableInputs& dims) { +::std::ostream& operator<<(::std::ostream& os, const RngNormalTableInputs& dims) +{ return os; } template -class RngNormalTableTest - : public ::testing::TestWithParam> { +class RngNormalTableTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { // Tests are configured with their expected test-values sigma. For example, // 4 x sigma indicates the test shouldn't fail 99.9% of the time. num_sigma = 10; - params = ::testing::TestWithParam>::GetParam(); - int len = params.rows * params.cols; + params = ::testing::TestWithParam>::GetParam(); + int len = params.rows * params.cols; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -523,11 +512,9 @@ class RngNormalTableTest allocate(mu_vec, params.cols); r.fill(mu_vec, params.cols, params.mu, stream); T* sigma_vec = nullptr; - r.normalTable(data, params.rows, params.cols, mu_vec, sigma_vec, - params.sigma, stream); + r.normalTable(data, params.rows, params.cols, mu_vec, sigma_vec, params.sigma, stream); static const int threads = 128; - meanKernel - <<>>(stats, data, len); + meanKernel<<>>(stats, data, len); update_host(h_stats, stats, 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= len; @@ -535,13 +522,15 @@ class RngNormalTableTest CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(stats)); CUDA_CHECK(cudaFree(mu_vec)); } - void getExpectedMeanVar(T meanvar[2]) { + void getExpectedMeanVar(T meanvar[2]) + { meanvar[0] = params.mu; meanvar[1] = params.sigma * params.sigma; } @@ -562,16 +551,14 @@ const std::vector> inputsf_t = { {0.0055, 32, 1024, 1.f, 1.f, GenKiss99, 1234ULL}, {0.011, 8, 1024, 1.f, 1.f, GenKiss99, 1234ULL}}; -TEST_P(RngNormalTableTestF, Result) { +TEST_P(RngNormalTableTestF, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], - CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], - CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestF, - ::testing::ValuesIn(inputsf_t)); +INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestF, ::testing::ValuesIn(inputsf_t)); typedef RngNormalTableTest RngNormalTableTestD; const std::vector> inputsd_t = { @@ -581,16 +568,14 @@ const std::vector> inputsd_t = { {0.011, 8, 1024, 1.0, 1.0, GenTaps, 1234ULL}, {0.0055, 32, 1024, 1.0, 1.0, GenKiss99, 1234ULL}, {0.011, 8, 1024, 1.0, 1.0, GenKiss99, 1234ULL}}; -TEST_P(RngNormalTableTestD, Result) { +TEST_P(RngNormalTableTestD, Result) +{ double meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], - CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], - CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestD, - ::testing::ValuesIn(inputsd_t)); +INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestD, ::testing::ValuesIn(inputsd_t)); struct RngAffineInputs { int n; @@ -599,13 +584,15 @@ struct RngAffineInputs { class RngAffineTest : public ::testing::TestWithParam { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam::GetParam(); Rng r(params.seed); r.affine_transform_params(params.n, a, b); } - void check() { + void check() + { ASSERT_TRUE(gcd(a, params.n) == 1); ASSERT_TRUE(0 <= b && b < params.n); } @@ -616,13 +603,17 @@ class RngAffineTest : public ::testing::TestWithParam { }; // RngAffineTest const std::vector inputs_affine = { - {100, 123456ULL}, {100, 1234567890ULL}, {101, 123456ULL}, - {101, 1234567890ULL}, {7, 123456ULL}, {7, 1234567890ULL}, - {2568, 123456ULL}, {2568, 1234567890ULL}, + {100, 123456ULL}, + {100, 1234567890ULL}, + {101, 123456ULL}, + {101, 1234567890ULL}, + {7, 123456ULL}, + {7, 1234567890ULL}, + {2568, 123456ULL}, + {2568, 1234567890ULL}, }; TEST_P(RngAffineTest, Result) { check(); } -INSTANTIATE_TEST_SUITE_P(RngAffineTests, RngAffineTest, - ::testing::ValuesIn(inputs_affine)); +INSTANTIATE_TEST_SUITE_P(RngAffineTests, RngAffineTest, ::testing::ValuesIn(inputs_affine)); } // namespace random } // namespace raft diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu index 92f12206e8..c77c3df526 100644 --- a/cpp/test/random/rng_int.cu +++ b/cpp/test/random/rng_int.cu @@ -27,12 +27,13 @@ namespace random { enum RandomType { RNG_Uniform }; template -__global__ void meanKernel(float *out, const T *data, int len) { +__global__ void meanKernel(float* out, const T* data, int len) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; - int tid = threadIdx.x + blockIdx.x * blockDim.x; + int tid = threadIdx.x + blockIdx.x * blockDim.x; float val = tid < len ? data[tid] : T(0); - float x = BlockReduce(temp_storage).Sum(val); + float x = BlockReduce(temp_storage).Sum(val); __syncthreads(); float xx = BlockReduce(temp_storage).Sum(val * val); __syncthreads(); @@ -59,14 +60,16 @@ struct RngInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const RngInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const RngInputs& dims) +{ return os; } template class RngTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); Rng r(params.seed, params.gtype); @@ -75,14 +78,11 @@ class RngTest : public ::testing::TestWithParam> { allocate(data, params.len); allocate(stats, 2, true); switch (params.type) { - case RNG_Uniform: - r.uniformInt(data, params.len, params.start, params.end, stream); - break; + case RNG_Uniform: r.uniformInt(data, params.len, params.start, params.end, stream); break; }; static const int threads = 128; meanKernel - <<>>(stats, data, - params.len); + <<>>(stats, data, params.len); update_host(h_stats, stats, 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; @@ -90,12 +90,14 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(stats)); } - void getExpectedMeanVar(float meanvar[2]) { + void getExpectedMeanVar(float meanvar[2]) + { switch (params.type) { case RNG_Uniform: meanvar[0] = (params.start + params.end) * 0.5f; @@ -107,8 +109,8 @@ class RngTest : public ::testing::TestWithParam> { protected: RngInputs params; - T *data; - float *stats; + T* data; + float* stats; float h_stats[2]; // mean, var }; @@ -120,13 +122,12 @@ const std::vector> inputs_u32 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestU32, Result) { +TEST_P(RngTestU32, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE( - match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE( - match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU32, ::testing::ValuesIn(inputs_u32)); @@ -138,13 +139,12 @@ const std::vector> inputs_u64 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestU64, Result) { +TEST_P(RngTestU64, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE( - match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE( - match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU64, ::testing::ValuesIn(inputs_u64)); @@ -156,13 +156,12 @@ const std::vector> inputs_s32 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestS32, Result) { +TEST_P(RngTestS32, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE( - match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE( - match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS32, ::testing::ValuesIn(inputs_s32)); @@ -174,13 +173,12 @@ const std::vector> inputs_s64 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestS64, Result) { +TEST_P(RngTestS64, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE( - match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE( - match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS64, ::testing::ValuesIn(inputs_s64)); diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu index d7e52a8958..c258841c3e 100644 --- a/cpp/test/random/sample_without_replacement.cu +++ b/cpp/test/random/sample_without_replacement.cu @@ -38,14 +38,16 @@ struct SWoRInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const SWoRInputs& dims) { +::std::ostream& operator<<(::std::ostream& os, const SWoRInputs& dims) +{ return os; } template class SWoRTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); CUDA_CHECK(cudaStreamCreate(&stream)); @@ -58,15 +60,14 @@ class SWoRTest : public ::testing::TestWithParam> { r.uniform(in, params.len, T(-1.0), T(1.0), stream); r.uniform(wts, params.len, T(1.0), T(2.0), stream); if (params.largeWeightIndex >= 0) { - update_device(wts + params.largeWeightIndex, ¶ms.largeWeight, 1, - stream); + update_device(wts + params.largeWeightIndex, ¶ms.largeWeight, 1, stream); } - r.sampleWithoutReplacement(handle, out, outIdx, in, wts, params.sampledLen, - params.len, stream); + r.sampleWithoutReplacement(handle, out, outIdx, in, wts, params.sampledLen, params.len, stream); update_host(&(h_outIdx[0]), outIdx, params.sampledLen, stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(in)); @@ -147,14 +148,14 @@ const std::vector> inputsf = { {1024, 512, 10, 100000.f, GenKiss99, 1234ULL}, }; -TEST_P(SWoRTestF, Result) { +TEST_P(SWoRTestF, Result) +{ std::set occurence; for (int i = 0; i < params.sampledLen; ++i) { auto val = h_outIdx[i]; // indices must be in the given range ASSERT_TRUE(0 <= val && val < params.len) - << "out-of-range index @i=" << i << " val=" << val - << " sampledLen=" << params.sampledLen; + << "out-of-range index @i=" << i << " val=" << val << " sampledLen=" << params.sampledLen; // indices should not repeat ASSERT_TRUE(occurence.find(val) == occurence.end()) << "repeated index @i=" << i << " idx=" << val; @@ -162,9 +163,7 @@ TEST_P(SWoRTestF, Result) { } // if there's a skewed distribution, the top index should correspond to the // particular item with a large weight - if (params.largeWeightIndex >= 0) { - ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); - } + if (params.largeWeightIndex >= 0) { ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); } } INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestF, ::testing::ValuesIn(inputsf)); @@ -231,14 +230,14 @@ const std::vector> inputsd = { {1024, 512, 10, 100000.0, GenKiss99, 1234ULL}, }; -TEST_P(SWoRTestD, Result) { +TEST_P(SWoRTestD, Result) +{ std::set occurence; for (int i = 0; i < params.sampledLen; ++i) { auto val = h_outIdx[i]; // indices must be in the given range ASSERT_TRUE(0 <= val && val < params.len) - << "out-of-range index @i=" << i << " val=" << val - << " sampledLen=" << params.sampledLen; + << "out-of-range index @i=" << i << " val=" << val << " sampledLen=" << params.sampledLen; // indices should not repeat ASSERT_TRUE(occurence.find(val) == occurence.end()) << "repeated index @i=" << i << " idx=" << val; @@ -246,9 +245,7 @@ TEST_P(SWoRTestD, Result) { } // if there's a skewed distribution, the top index should correspond to the // particular item with a large weight - if (params.largeWeightIndex >= 0) { - ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); - } + if (params.largeWeightIndex >= 0) { ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); } } INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestD, ::testing::ValuesIn(inputsd)); diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu index 713708d4cd..e1f814a5b6 100644 --- a/cpp/test/sparse/add.cu +++ b/cpp/test/sparse/add.cu @@ -44,14 +44,14 @@ struct CSRAddInputs { }; template -class CSRAddTest - : public ::testing::TestWithParam> { +class CSRAddTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = ::testing::TestWithParam>::GetParam(); - n_rows = params.matrix_a.row_ind.size(); - nnz_a = params.matrix_a.row_ind_ptr.size(); - nnz_b = params.matrix_b.row_ind_ptr.size(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + n_rows = params.matrix_a.row_ind.size(); + nnz_a = params.matrix_a.row_ind_ptr.size(); + nnz_b = params.matrix_b.row_ind_ptr.size(); nnz_result = params.matrix_verify.row_ind_ptr.size(); cudaStreamCreate(&stream); @@ -73,46 +73,61 @@ class CSRAddTest raft::allocate(values_result, nnz_result); } - void Run() { - std::shared_ptr alloc( - new raft::mr::device::default_allocator); + void Run() + { + std::shared_ptr alloc(new raft::mr::device::default_allocator); raft::update_device(ind_a, params.matrix_a.row_ind.data(), n_rows, stream); - raft::update_device(ind_ptr_a, params.matrix_a.row_ind_ptr.data(), nnz_a, - stream); + raft::update_device(ind_ptr_a, params.matrix_a.row_ind_ptr.data(), nnz_a, stream); raft::update_device(values_a, params.matrix_a.values.data(), nnz_a, stream); raft::update_device(ind_b, params.matrix_b.row_ind.data(), n_rows, stream); - raft::update_device(ind_ptr_b, params.matrix_b.row_ind_ptr.data(), nnz_b, - stream); + raft::update_device(ind_ptr_b, params.matrix_b.row_ind_ptr.data(), nnz_b, stream); raft::update_device(values_b, params.matrix_b.values.data(), nnz_b, stream); - raft::update_device(ind_verify, params.matrix_verify.row_ind.data(), n_rows, - stream); - raft::update_device(ind_ptr_verify, params.matrix_verify.row_ind_ptr.data(), - nnz_result, stream); - raft::update_device(values_verify, params.matrix_verify.values.data(), - nnz_result, stream); - - Index_ nnz = linalg::csr_add_calc_inds( - ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b, - n_rows, ind_result, alloc, stream); + raft::update_device(ind_verify, params.matrix_verify.row_ind.data(), n_rows, stream); + raft::update_device( + ind_ptr_verify, params.matrix_verify.row_ind_ptr.data(), nnz_result, stream); + raft::update_device(values_verify, params.matrix_verify.values.data(), nnz_result, stream); + + Index_ nnz = linalg::csr_add_calc_inds(ind_a, + ind_ptr_a, + values_a, + nnz_a, + ind_b, + ind_ptr_b, + values_b, + nnz_b, + n_rows, + ind_result, + alloc, + stream); ASSERT_TRUE(nnz == nnz_result); - ASSERT_TRUE(raft::devArrMatch(ind_verify, ind_result, n_rows, - raft::Compare())); - - linalg::csr_add_finalize( - ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b, - n_rows, ind_result, ind_ptr_result, values_result, stream); - - ASSERT_TRUE(raft::devArrMatch(ind_ptr_verify, ind_ptr_result, nnz, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(values_verify, values_result, nnz, - raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(ind_verify, ind_result, n_rows, raft::Compare())); + + linalg::csr_add_finalize(ind_a, + ind_ptr_a, + values_a, + nnz_a, + ind_b, + ind_ptr_b, + values_b, + nnz_b, + n_rows, + ind_result, + ind_ptr_result, + values_result, + stream); + + ASSERT_TRUE( + raft::devArrMatch(ind_ptr_verify, ind_ptr_result, nnz, raft::Compare())); + ASSERT_TRUE( + raft::devArrMatch(values_verify, values_result, nnz, raft::Compare())); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(ind_a)); CUDA_CHECK(cudaFree(ind_b)); CUDA_CHECK(cudaFree(ind_result)); @@ -131,8 +146,8 @@ class CSRAddTest CSRAddInputs params; cudaStream_t stream; Index_ n_rows, nnz_a, nnz_b, nnz_result; - Index_ *ind_a, *ind_b, *ind_verify, *ind_result, *ind_ptr_a, *ind_ptr_b, - *ind_ptr_verify, *ind_ptr_result; + Index_ *ind_a, *ind_b, *ind_verify, *ind_result, *ind_ptr_a, *ind_ptr_b, *ind_ptr_verify, + *ind_ptr_result; Type_f *values_a, *values_b, *values_verify, *values_result; }; @@ -165,10 +180,8 @@ const std::vector> csradd_inputs_d = { {2.0, 2.0, 0.5, 1.0, 0.5, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}}}, }; -INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestF, - ::testing::ValuesIn(csradd_inputs_f)); -INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestD, - ::testing::ValuesIn(csradd_inputs_d)); +INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestF, ::testing::ValuesIn(csradd_inputs_f)); +INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestD, ::testing::ValuesIn(csradd_inputs_d)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu index d98f9de9c3..3678d34bbe 100644 --- a/cpp/test/sparse/connect_components.cu +++ b/cpp/test/sparse/connect_components.cu @@ -51,26 +51,24 @@ struct ConnectComponentsInputs { }; template -class ConnectComponentsTest : public ::testing::TestWithParam< - ConnectComponentsInputs> { +class ConnectComponentsTest + : public ::testing::TestWithParam> { protected: - void basicTest() { + void basicTest() + { raft::handle_t handle; auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); - params = ::testing::TestWithParam< - ConnectComponentsInputs>::GetParam(); + params = ::testing::TestWithParam>::GetParam(); - raft::sparse::COO out_edges( - handle.get_device_allocator(), handle.get_stream()); + raft::sparse::COO out_edges(handle.get_device_allocator(), + handle.get_stream()); - rmm::device_uvector data(params.n_row * params.n_col, - handle.get_stream()); + rmm::device_uvector data(params.n_row * params.n_col, handle.get_stream()); - raft::copy(data.data(), params.data.data(), data.size(), - handle.get_stream()); + raft::copy(data.data(), params.data.data(), data.size(), handle.get_stream()); rmm::device_uvector indptr(params.n_row + 1, stream); @@ -79,44 +77,58 @@ class ConnectComponentsTest : public ::testing::TestWithParam< */ raft::sparse::COO knn_graph_coo(d_alloc, stream); - raft::sparse::selection::knn_graph( - handle, data.data(), params.n_row, params.n_col, - raft::distance::DistanceType::L2SqrtExpanded, knn_graph_coo, params.c); + raft::sparse::selection::knn_graph(handle, + data.data(), + params.n_row, + params.n_col, + raft::distance::DistanceType::L2SqrtExpanded, + knn_graph_coo, + params.c); - raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), - knn_graph_coo.nnz, indptr.data(), - params.n_row + 1, d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr( + knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), params.n_row + 1, d_alloc, stream); /** * 2. Construct MST, sorted by weights */ rmm::device_uvector colors(params.n_row, stream); - auto mst_coo = raft::mst::mst( - handle, indptr.data(), knn_graph_coo.cols(), knn_graph_coo.vals(), - params.n_row, knn_graph_coo.nnz, colors.data(), stream, false, true); + auto mst_coo = raft::mst::mst(handle, + indptr.data(), + knn_graph_coo.cols(), + knn_graph_coo.vals(), + params.n_row, + knn_graph_coo.nnz, + colors.data(), + stream, + false, + true); /** * 3. connect_components to fix connectivities */ - raft::linkage::FixConnectivitiesRedOp red_op( - colors.data(), params.n_row); + raft::linkage::FixConnectivitiesRedOp red_op(colors.data(), params.n_row); raft::linkage::connect_components( - handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, - red_op); + handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, red_op); /** * Construct final edge list */ rmm::device_uvector indptr2(params.n_row + 1, stream); - raft::sparse::convert::sorted_coo_to_csr(out_edges.rows(), out_edges.nnz, - indptr2.data(), params.n_row + 1, - d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr( + out_edges.rows(), out_edges.nnz, indptr2.data(), params.n_row + 1, d_alloc, stream); - auto output_mst = raft::mst::mst( - handle, indptr2.data(), out_edges.cols(), out_edges.vals(), params.n_row, - out_edges.nnz, colors.data(), stream, false, false); + auto output_mst = raft::mst::mst(handle, + indptr2.data(), + out_edges.cols(), + out_edges.vals(), + params.n_row, + out_edges.nnz, + colors.data(), + stream, + false, + false); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -138,366 +150,199 @@ const std::vector> fix_conn_inputsf2 = { // Test n_clusters == n_points {10, 5, - {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, - 0.77782677, 0.43772379, 0.4035871, 0.3282796, 0.47544681, 0.59862974, - 0.12319357, 0.06239463, 0.28200272, 0.1345717, 0.50498218, 0.5113505, - 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, - 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, - 0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792, - 0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692, - 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, + {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379, + 0.4035871, 0.3282796, 0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717, + 0.50498218, 0.5113505, 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, + 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, 0.84854131, 0.28890216, + 0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554, + 0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, 0.76166195, 0.66613745}, -1}, // Test n_points == 100 {100, 10, - {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, - 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, - 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, - 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, - 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, - 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, - 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, - 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, - 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, - 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, - 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, - 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, - 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, - 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, - 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, - 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, - 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, - 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, - 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, - 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, - 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, - 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, - 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, - 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, - 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, - 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, - 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, - 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, - 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, - 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, - 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, - 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, - 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, - 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, - 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, - 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, - 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, - 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, - 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, - 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, - 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, - 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, - 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, - 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, - 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, - 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, - 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, - 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, - 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, - 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, - 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, - 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, - 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, - 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, - 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, - 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, - 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, - 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, - 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, - 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, - 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, - 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, - 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, - 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, - 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, - 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, - 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, - 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, - 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, - 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, - 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, - 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, - 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, - 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, - 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, - 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, - 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, - 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, - 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, - 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, - 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, - 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, - 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, - 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, - 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, - 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, - 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, - 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, - 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, - 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, - 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, - 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, - 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, - 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, - 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, - 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, - 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, - 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, - 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, - 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, - 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, - 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, - 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, - 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, - 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, - 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, - 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, - 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, - 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, - 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, - 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, - 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, - 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, - 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, - 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, - 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, - 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, - 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, - 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, - 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, - 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, - 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, - 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, - 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, - 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, - 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, - 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, - 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, - 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, - 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, - 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, - 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, - 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, - 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, - 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, - 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, - 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, - 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, - 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, - 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, - 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, - 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, - 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, - 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, - 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, - 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, - 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, - 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, - 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, - 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, - 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, - 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, - 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, - 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, - 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, - 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, - 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, - 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, - 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, - 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, - 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, - 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, - 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, - 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, - 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, - 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, - 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, - 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, - 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, - 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, - 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, - 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, - 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, - 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, - 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, - 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, - 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, - 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, - 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, - 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, - 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, - 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, - 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, - 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, - 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, - 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, - 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, - 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, - 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, - 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, - 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, - 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, - 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, - 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, - 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, - 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, - 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, - 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, - 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, - 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, - 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, - 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, - 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, - 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, - 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, - 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, - 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, - 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, - 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, - 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, - 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, - 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, - 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, - 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, - 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, - 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, - 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, - 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, - 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, - 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, - 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, - 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, - 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, - 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, - 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, - 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, - 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, - 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, - 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, - 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, - 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, - 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, - 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, - 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, - 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, - 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, - 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, - 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, - 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, - 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, - 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, - 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, - 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, - 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, - 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, - 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, - 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, - 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, - 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, - 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, - 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, - 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, - 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, - 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, - 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, - 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, - 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, - 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, - 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, - 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, - 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, - 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, - 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, - 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, - 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, - 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, - 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, - 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, - 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, - 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, - 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, - 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, - 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, - 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, - 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, - 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, - 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, - 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, - 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, - 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, - 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, - 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, - 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, - 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, - 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, - 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, - 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, - 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, - 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, - 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, - 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, - 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, - 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, - 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, - 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, - 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, - 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, - 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, - 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, - 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, - 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, - 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, - 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, - 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, - 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, - 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, - 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, - 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, - 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, - 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, - 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, - 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, - 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, - 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, - 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, - 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, - 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, - 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, - 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, - 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, - 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, - 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, - 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, - 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, - 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, - 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, - 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, - 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, - 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, - 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, - 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, - 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, - 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, - 8.66342445e-01 + {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, + 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, + 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, + 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, + 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, + 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, + 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, + 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, + 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, + 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, + 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, + 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, + 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, + 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, + 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, + 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, + 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, + 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, + 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, + 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, + 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, + 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, + 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, + 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, + 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, + 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, + 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, + 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, + 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, + 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, + 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, + 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, + 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, + 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, + 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, + 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, + 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, + 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, + 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, + 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, + 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, + 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, + 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, + 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, + 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, + 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, + 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, + 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, + 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, + 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, + 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, + 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, + 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, + 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, + 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, + 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, + 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, + 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, + 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, + 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, + 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, + 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, + 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, + 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, + 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, + 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, + 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, + 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, + 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, + 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, + 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, + 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, + 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, + 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, + 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, + 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, + 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, + 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, + 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, + 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, + 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, + 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, + 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, + 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, + 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, + 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, + 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, + 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, + 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, + 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, + 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, + 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, + 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, + 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, + 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, + 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, + 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, + 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, + 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, + 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, + 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, + 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, + 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, + 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, + 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, + 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, + 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, + 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, + 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, + 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, + 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, + 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, + 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, + 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, + 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, + 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, + 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, + 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, + 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, + 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, + 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, + 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, + 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, + 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, + 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, + 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, + 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, + 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, + 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, + 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, + 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, + 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, + 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, + 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, + 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, + 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, + 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, + 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, + 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, + 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, + 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, + 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, + 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, + 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, + 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, + 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, + 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, + 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, + 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, + 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, + 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, + 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, + 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, + 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, + 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, + 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, + 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, + 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, + 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, + 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, + 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, + 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, + 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, + 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, + 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, + 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, + 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01 }, -4}}; typedef ConnectComponentsTest ConnectComponentsTestF_Int; -TEST_P(ConnectComponentsTestF_Int, Result) { +TEST_P(ConnectComponentsTestF_Int, Result) +{ /** - * Verify the src & dst vertices on each edge have different colors - */ + * Verify the src & dst vertices on each edge have different colors + */ EXPECT_TRUE(final_edges == params.n_row - 1); } -INSTANTIATE_TEST_CASE_P(ConnectComponentsTest, ConnectComponentsTestF_Int, +INSTANTIATE_TEST_CASE_P(ConnectComponentsTest, + ConnectComponentsTestF_Int, ::testing::ValuesIn(fix_conn_inputsf2)); }; // namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu index ea69ecfc53..2e4c2c1a14 100644 --- a/cpp/test/sparse/convert_coo.cu +++ b/cpp/test/sparse/convert_coo.cu @@ -39,7 +39,8 @@ struct CSRtoCOOInputs { template class CSRtoCOOTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); cudaStreamCreate(&stream); @@ -48,20 +49,21 @@ class CSRtoCOOTest : public ::testing::TestWithParam> { raft::allocate(result, params.verify.size(), true); } - void Run() { + void Run() + { Index_ n_rows = params.ex_scan.size(); - Index_ nnz = params.verify.size(); + Index_ nnz = params.verify.size(); raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream); raft::update_device(verify, params.verify.data(), nnz, stream); convert::csr_to_coo(ex_scan, n_rows, result, nnz, stream); - ASSERT_TRUE(raft::devArrMatch(verify, result, nnz, - raft::Compare(), stream)); + ASSERT_TRUE(raft::devArrMatch(verify, result, nnz, raft::Compare(), stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(ex_scan)); CUDA_CHECK(cudaFree(verify)); CUDA_CHECK(cudaFree(result)); @@ -89,9 +91,11 @@ const std::vector> csrtocoo_inputs_64 = { {{0, 4, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 2, 3}}, }; -INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestI, +INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, + CSRtoCOOTestI, ::testing::ValuesIn(csrtocoo_inputs_32)); -INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestL, +INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, + CSRtoCOOTestL, ::testing::ValuesIn(csrtocoo_inputs_64)); } // namespace sparse diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu index 553ef2ddee..b2878081ae 100644 --- a/cpp/test/sparse/convert_csr.cu +++ b/cpp/test/sparse/convert_csr.cu @@ -37,14 +37,13 @@ struct SparseConvertCSRInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const SparseConvertCSRInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SparseConvertCSRInputs& dims) +{ return os; } template -class SparseConvertCSRTest - : public ::testing::TestWithParam> { +class SparseConvertCSRTest : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -54,22 +53,21 @@ class SparseConvertCSRTest SparseConvertCSRInputs params; }; -const std::vector> inputsf = { - {5, 10, 5, 1234ULL}}; +const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseConvertCSRTest SortedCOOToCSR; -TEST_P(SortedCOOToCSR, Result) { +TEST_P(SortedCOOToCSR, Result) +{ cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); + std::shared_ptr alloc(new raft::mr::device::default_allocator); int nnz = 8; int *in, *out, *exp; - int *in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; - int *exp_h = new int[4]{0, 2, 4, 6}; + int* in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; + int* exp_h = new int[4]{0, 2, 4, 6}; raft::allocate(in, nnz, true); raft::allocate(exp, 4, true); @@ -92,8 +90,7 @@ TEST_P(SortedCOOToCSR, Result) { CUDA_CHECK(cudaFree(out)); } -INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR, ::testing::ValuesIn(inputsf)); /******************************** adj graph ********************************/ @@ -107,10 +104,10 @@ struct CSRAdjGraphInputs { }; template -class CSRAdjGraphTest - : public ::testing::TestWithParam> { +class CSRAdjGraphTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); cudaStreamCreate(&stream); nnz = params.verify.size(); @@ -121,20 +118,21 @@ class CSRAdjGraphTest raft::allocate(verify, nnz); } - void Run() { + void Run() + { raft::update_device(row_ind, params.row_ind.data(), params.n_rows, stream); - raft::update_device(adj, reinterpret_cast(params.adj.data()), - params.n_rows * params.n_cols, stream); + raft::update_device( + adj, reinterpret_cast(params.adj.data()), params.n_rows * params.n_cols, stream); raft::update_device(verify, params.verify.data(), nnz, stream); convert::csr_adj_graph_batched( row_ind, params.n_cols, nnz, params.n_rows, adj, result, stream); - ASSERT_TRUE( - raft::devArrMatch(verify, result, nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(verify, result, nnz, raft::Compare())); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(row_ind)); CUDA_CHECK(cudaFree(adj)); CUDA_CHECK(cudaFree(verify)); @@ -147,7 +145,7 @@ class CSRAdjGraphTest cudaStream_t stream; Index_ nnz; Index_ *row_ind, *result, *verify; - bool *adj; + bool* adj; }; using CSRAdjGraphTestI = CSRAdjGraphTest; @@ -171,9 +169,11 @@ const std::vector> csradjgraph_inputs_l = { {0, 1, 2, 0, 1, 2, 0, 1, 2}}, }; -INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestI, +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, + CSRAdjGraphTestI, ::testing::ValuesIn(csradjgraph_inputs_i)); -INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestL, +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, + CSRAdjGraphTestL, ::testing::ValuesIn(csradjgraph_inputs_l)); } // namespace sparse diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu index 625772a842..fe43f0d182 100644 --- a/cpp/test/sparse/csr_row_slice.cu +++ b/cpp/test/sparse/csr_row_slice.cu @@ -47,19 +47,19 @@ struct CSRRowSliceInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const CSRRowSliceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const CSRRowSliceInputs& dims) +{ return os; } template -class CSRRowSliceTest - : public ::testing::TestWithParam> { +class CSRRowSliceTest : public ::testing::TestWithParam> { protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); @@ -69,31 +69,27 @@ class CSRRowSliceTest update_device(indices, indices_h.data(), indices_h.size(), stream); update_device(data, data_h.data(), data_h.size(), stream); - std::vector out_indptr_ref_h = params.out_indptr_ref_h; + std::vector out_indptr_ref_h = params.out_indptr_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; - std::vector out_data_ref_h = params.out_data_ref_h; + std::vector out_data_ref_h = params.out_data_ref_h; allocate(out_indptr_ref, out_indptr_ref_h.size()); allocate(out_indices_ref, out_indices_ref_h.size()); allocate(out_data_ref, out_data_ref_h.size()); - update_device(out_indptr_ref, out_indptr_ref_h.data(), - out_indptr_ref_h.size(), stream); - update_device(out_indices_ref, out_indices_ref_h.data(), - out_indices_ref_h.size(), stream); - update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), - stream); + update_device(out_indptr_ref, out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream); + update_device(out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), stream); + update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), stream); allocate(out_indptr, out_indptr_ref_h.size()); allocate(out_indices, out_indices_ref_h.size()); allocate(out_data, out_data_ref_h.size()); } - void SetUp() override { - params = ::testing::TestWithParam< - CSRRowSliceInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + std::shared_ptr alloc(new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); make_data(); @@ -101,18 +97,22 @@ class CSRRowSliceTest int csr_start_offset; int csr_stop_offset; - raft::sparse::op::csr_row_slice_indptr( - params.start_row, params.stop_row, indptr, out_indptr, &csr_start_offset, - &csr_stop_offset, stream); + raft::sparse::op::csr_row_slice_indptr(params.start_row, + params.stop_row, + indptr, + out_indptr, + &csr_start_offset, + &csr_stop_offset, + stream); - raft::sparse::op::csr_row_slice_populate(csr_start_offset, csr_stop_offset, - indices, data, out_indices, - out_data, stream); + raft::sparse::op::csr_row_slice_populate( + csr_start_offset, csr_stop_offset, indices, data, out_indices, out_data, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); @@ -125,15 +125,14 @@ class CSRRowSliceTest CUDA_CHECK(cudaFree(out_data_ref)); } - void compare() { - ASSERT_TRUE(devArrMatch(out_indptr, out_indptr_ref, - params.out_indptr_ref_h.size(), - Compare())); - ASSERT_TRUE(devArrMatch(out_indices, out_indices_ref, - params.out_indices_ref_h.size(), - Compare())); - ASSERT_TRUE(devArrMatch(out_data, out_data_ref, - params.out_data_ref_h.size(), Compare())); + void compare() + { + ASSERT_TRUE( + devArrMatch(out_indptr, out_indptr_ref, params.out_indptr_ref_h.size(), Compare())); + ASSERT_TRUE(devArrMatch( + out_indices, out_indices_ref, params.out_indices_ref_h.size(), Compare())); + ASSERT_TRUE( + devArrMatch(out_data, out_data_ref, params.out_data_ref_h.size(), Compare())); } protected: @@ -141,15 +140,15 @@ class CSRRowSliceTest // input data value_idx *indptr, *indices; - value_t *data; + value_t* data; // output data value_idx *out_indptr, *out_indices; - value_t *out_data; + value_t* out_data; // expected output data value_idx *out_indptr_ref, *out_indices_ref; - value_t *out_data_ref; + value_t* out_data_ref; CSRRowSliceInputs params; }; @@ -177,8 +176,7 @@ const std::vector> inputs_i32_f = { }; typedef CSRRowSliceTest CSRRowSliceTestF; TEST_P(CSRRowSliceTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(CSRRowSliceTest, CSRRowSliceTestF, - ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(CSRRowSliceTest, CSRRowSliceTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu index 5535df4fe3..286493ada7 100644 --- a/cpp/test/sparse/csr_to_dense.cu +++ b/cpp/test/sparse/csr_to_dense.cu @@ -43,19 +43,19 @@ struct CSRToDenseInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const CSRToDenseInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const CSRToDenseInputs& dims) +{ return os; } template -class CSRToDenseTest - : public ::testing::TestWithParam> { +class CSRToDenseTest : public ::testing::TestWithParam> { protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); @@ -74,24 +74,24 @@ class CSRToDenseTest allocate(out, out_ref_h.size()); } - void SetUp() override { - params = ::testing::TestWithParam< - CSRToDenseInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + std::shared_ptr alloc(new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&handle)); make_data(); - convert::csr_to_dense(handle, params.nrows, params.ncols, indptr, indices, - data, params.nrows, out, stream, true); + convert::csr_to_dense( + handle, params.nrows, params.ncols, indptr, indices, data, params.nrows, out, stream, true); CUDA_CHECK(cudaStreamSynchronize(stream)); CUSPARSE_CHECK(cusparseDestroy(handle)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); @@ -100,9 +100,9 @@ class CSRToDenseTest CUDA_CHECK(cudaFree(out_ref)); } - void compare() { - ASSERT_TRUE( - devArrMatch(out, out_ref, params.out_ref_h.size(), Compare())); + void compare() + { + ASSERT_TRUE(devArrMatch(out, out_ref, params.out_ref_h.size(), Compare())); } protected: @@ -111,13 +111,13 @@ class CSRToDenseTest // input data value_idx *indptr, *indices; - value_t *data; + value_t* data; // output data - value_t *out; + value_t* out; // expected output data - value_t *out_ref; + value_t* out_ref; CSRToDenseInputs params; }; @@ -128,13 +128,26 @@ const std::vector> inputs_i32_f = { {0, 2, 4, 6, 8}, {0, 1, 2, 3, 0, 1, 2, 3}, // indices {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f}, - {1.0f, 3.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 5.0f, 50.0f, 28.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 16.0f, 2.0f}}, + {1.0f, + 3.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 1.0f, + 5.0f, + 50.0f, + 28.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 16.0f, + 2.0f}}, }; typedef CSRToDenseTest CSRToDenseTestF; TEST_P(CSRToDenseTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(CSRToDenseTest, CSRToDenseTestF, - ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(CSRToDenseTest, CSRToDenseTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu index c257d6eb3c..87b8b17073 100644 --- a/cpp/test/sparse/csr_transpose.cu +++ b/cpp/test/sparse/csr_transpose.cu @@ -49,19 +49,19 @@ struct CSRTransposeInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const CSRTransposeInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const CSRTransposeInputs& dims) +{ return os; } template -class CSRTransposeTest - : public ::testing::TestWithParam> { +class CSRTransposeTest : public ::testing::TestWithParam> { protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); @@ -71,45 +71,51 @@ class CSRTransposeTest update_device(indices, indices_h.data(), indices_h.size(), stream); update_device(data, data_h.data(), data_h.size(), stream); - std::vector out_indptr_ref_h = params.out_indptr_ref_h; + std::vector out_indptr_ref_h = params.out_indptr_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; - std::vector out_data_ref_h = params.out_data_ref_h; + std::vector out_data_ref_h = params.out_data_ref_h; allocate(out_indptr_ref, out_indptr_ref_h.size()); allocate(out_indices_ref, out_indices_ref_h.size()); allocate(out_data_ref, out_data_ref_h.size()); - update_device(out_indptr_ref, out_indptr_ref_h.data(), - out_indptr_ref_h.size(), stream); - update_device(out_indices_ref, out_indices_ref_h.data(), - out_indices_ref_h.size(), stream); - update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), - stream); + update_device(out_indptr_ref, out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream); + update_device(out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), stream); + update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), stream); allocate(out_indptr, out_indptr_ref_h.size()); allocate(out_indices, out_indices_ref_h.size()); allocate(out_data, out_data_ref_h.size()); } - void SetUp() override { - params = ::testing::TestWithParam< - CSRTransposeInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + std::shared_ptr alloc(new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&handle)); make_data(); - raft::sparse::linalg::csr_transpose( - handle, indptr, indices, data, out_indptr, out_indices, out_data, - params.nrows, params.ncols, params.nnz, alloc, stream); + raft::sparse::linalg::csr_transpose(handle, + indptr, + indices, + data, + out_indptr, + out_indices, + out_data, + params.nrows, + params.ncols, + params.nnz, + alloc, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); CUSPARSE_CHECK(cusparseDestroy(handle)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); @@ -122,15 +128,14 @@ class CSRTransposeTest CUDA_CHECK(cudaFree(out_data_ref)); } - void compare() { - ASSERT_TRUE(devArrMatch(out_indptr, out_indptr_ref, - params.out_indptr_ref_h.size(), - Compare())); - ASSERT_TRUE(devArrMatch(out_indices, out_indices_ref, - params.out_indices_ref_h.size(), - Compare())); - ASSERT_TRUE(devArrMatch(out_data, out_data_ref, - params.out_data_ref_h.size(), Compare())); + void compare() + { + ASSERT_TRUE( + devArrMatch(out_indptr, out_indptr_ref, params.out_indptr_ref_h.size(), Compare())); + ASSERT_TRUE(devArrMatch( + out_indices, out_indices_ref, params.out_indices_ref_h.size(), Compare())); + ASSERT_TRUE( + devArrMatch(out_data, out_data_ref, params.out_data_ref_h.size(), Compare())); } protected: @@ -139,15 +144,15 @@ class CSRTransposeTest // input data value_idx *indptr, *indices; - value_t *data; + value_t* data; // output data value_idx *out_indptr, *out_indices; - value_t *out_data; + value_t* out_data; // expected output data value_idx *out_indptr_ref, *out_indices_ref; - value_t *out_data_ref; + value_t* out_data_ref; CSRTransposeInputs params; }; @@ -167,8 +172,7 @@ const std::vector> inputs_i32_f = { }; typedef CSRTransposeTest CSRTransposeTestF; TEST_P(CSRTransposeTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(CSRTransposeTest, CSRTransposeTestF, - ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(CSRTransposeTest, CSRTransposeTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu index 5d687ad92b..c6b2a27273 100644 --- a/cpp/test/sparse/degree.cu +++ b/cpp/test/sparse/degree.cu @@ -33,8 +33,7 @@ struct SparseDegreeInputs { }; template -class SparseDegreeTests - : public ::testing::TestWithParam> { +class SparseDegreeTests : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -47,11 +46,12 @@ class SparseDegreeTests const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseDegreeTests COODegree; -TEST_P(COODegree, Result) { +TEST_P(COODegree, Result) +{ int *in_rows, *verify, *results; int in_rows_h[5] = {0, 0, 1, 2, 2}; - int verify_h[5] = {2, 1, 2, 0, 0}; + int verify_h[5] = {2, 1, 2, 0, 0}; raft::allocate(in_rows, 5); raft::allocate(verify, 5, true); @@ -70,16 +70,17 @@ TEST_P(COODegree, Result) { } typedef SparseDegreeTests COODegreeNonzero; -TEST_P(COODegreeNonzero, Result) { +TEST_P(COODegreeNonzero, Result) +{ cudaStream_t stream; cudaStreamCreate(&stream); int *in_rows, *verify, *results; - float *in_vals; + float* in_vals; - int in_rows_h[5] = {0, 0, 1, 2, 2}; + int in_rows_h[5] = {0, 0, 1, 2, 2}; float in_vals_h[5] = {0.0, 5.0, 0.0, 1.0, 1.0}; - int verify_h[5] = {1, 0, 2, 0, 0}; + int verify_h[5] = {1, 0, 2, 0, 0}; raft::allocate(in_rows, 5); raft::allocate(verify, 5, true); @@ -101,10 +102,8 @@ TEST_P(COODegreeNonzero, Result) { CUDA_CHECK(cudaStreamDestroy(stream)); } -INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree, - ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegreeNonzero, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegreeNonzero, ::testing::ValuesIn(inputsf)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu index a83b93f83f..7c0db49a04 100644 --- a/cpp/test/sparse/dist_coo_spmv.cu +++ b/cpp/test/sparse/dist_coo_spmv.cu @@ -55,71 +55,82 @@ struct InputConfiguration { }; using dense_smem_strategy_t = dense_smem_strategy; -using hash_strategy_t = hash_strategy; +using hash_strategy_t = hash_strategy; template struct SparseDistanceCOOSPMVInputs { InputConfiguration input_configuration; float capacity_threshold = 0.5; - int map_size = hash_strategy::get_map_size(); + int map_size = hash_strategy::get_map_size(); }; template -::std::ostream &operator<<( - ::std::ostream &os, - const SparseDistanceCOOSPMVInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, + const SparseDistanceCOOSPMVInputs& dims) +{ return os; } template class SparseDistanceCOOSPMVTest - : public ::testing::TestWithParam< - SparseDistanceCOOSPMVInputs> { + : public ::testing::TestWithParam> { public: SparseDistanceCOOSPMVTest() : dist_config(handle) {} - template > * = nullptr> - U make_strategy() { + template >* = nullptr> + U make_strategy() + { return strategy_t(dist_config, params.capacity_threshold, params.map_size); } - template > * = nullptr> - U make_strategy() { + template >* = nullptr> + U make_strategy() + { return strategy_t(dist_config); } template - void compute_dist(reduce_f reduce_func, accum_f accum_func, - write_f write_func, bool rev = true) { - raft::mr::device::buffer coo_rows( - dist_config.handle.get_device_allocator(), - dist_config.handle.get_stream(), - max(dist_config.b_nnz, dist_config.a_nnz)); - - raft::sparse::convert::csr_to_coo(dist_config.b_indptr, dist_config.b_nrows, - coo_rows.data(), dist_config.b_nnz, + void compute_dist(reduce_f reduce_func, accum_f accum_func, write_f write_func, bool rev = true) + { + raft::mr::device::buffer coo_rows(dist_config.handle.get_device_allocator(), + dist_config.handle.get_stream(), + max(dist_config.b_nnz, dist_config.a_nnz)); + + raft::sparse::convert::csr_to_coo(dist_config.b_indptr, + dist_config.b_nrows, + coo_rows.data(), + dist_config.b_nnz, dist_config.handle.get_stream()); strategy_t selected_strategy = make_strategy(); - balanced_coo_pairwise_generalized_spmv( - out_dists, dist_config, coo_rows.data(), reduce_func, accum_func, - write_func, selected_strategy); + balanced_coo_pairwise_generalized_spmv(out_dists, + dist_config, + coo_rows.data(), + reduce_func, + accum_func, + write_func, + selected_strategy); if (rev) { - raft::sparse::convert::csr_to_coo( - dist_config.a_indptr, dist_config.a_nrows, coo_rows.data(), - dist_config.a_nnz, dist_config.handle.get_stream()); - - balanced_coo_pairwise_generalized_spmv_rev( - out_dists, dist_config, coo_rows.data(), reduce_func, accum_func, - write_func, selected_strategy); + raft::sparse::convert::csr_to_coo(dist_config.a_indptr, + dist_config.a_nrows, + coo_rows.data(), + dist_config.a_nnz, + dist_config.handle.get_stream()); + + balanced_coo_pairwise_generalized_spmv_rev(out_dists, + dist_config, + coo_rows.data(), + reduce_func, + accum_func, + write_func, + selected_strategy); } } - void run_spmv() { + void run_spmv() + { switch (params.input_configuration.metric) { case raft::distance::DistanceType::InnerProduct: compute_dist(Product(), Sum(), AtomicAdd(), true); @@ -129,75 +140,69 @@ class SparseDistanceCOOSPMVTest break; case raft::distance::DistanceType::Canberra: compute_dist( - [] __device__(value_t a, value_t b) { - return fabsf(a - b) / (fabsf(a) + fabsf(b)); - }, - Sum(), AtomicAdd()); - break; - case raft::distance::DistanceType::L1: - compute_dist(AbsDiff(), Sum(), AtomicAdd()); - break; - case raft::distance::DistanceType::Linf: - compute_dist(AbsDiff(), Max(), AtomicMax()); + [] __device__(value_t a, value_t b) { return fabsf(a - b) / (fabsf(a) + fabsf(b)); }, + Sum(), + AtomicAdd()); break; + case raft::distance::DistanceType::L1: compute_dist(AbsDiff(), Sum(), AtomicAdd()); break; + case raft::distance::DistanceType::Linf: compute_dist(AbsDiff(), Max(), AtomicMax()); break; case raft::distance::DistanceType::LpUnexpanded: { - compute_dist(PDiff(params.input_configuration.metric_arg), Sum(), - AtomicAdd()); + compute_dist(PDiff(params.input_configuration.metric_arg), Sum(), AtomicAdd()); float p = 1.0f / params.input_configuration.metric_arg; raft::linalg::unaryOp( - out_dists, out_dists, dist_config.a_nrows * dist_config.b_nrows, + out_dists, + out_dists, + dist_config.a_nrows * dist_config.b_nrows, [=] __device__(value_t input) { return powf(input, p); }, dist_config.handle.get_stream()); } break; - default: - throw raft::exception("Unknown distance"); + default: throw raft::exception("Unknown distance"); } } protected: - void make_data() { - std::vector indptr_h = params.input_configuration.indptr_h; + void make_data() + { + std::vector indptr_h = params.input_configuration.indptr_h; std::vector indices_h = params.input_configuration.indices_h; - std::vector data_h = params.input_configuration.data_h; + std::vector data_h = params.input_configuration.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); allocate(data, data_h.size()); - update_device(indptr, indptr_h.data(), indptr_h.size(), - handle.get_stream()); - update_device(indices, indices_h.data(), indices_h.size(), - handle.get_stream()); + update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream()); + update_device(indices, indices_h.data(), indices_h.size(), handle.get_stream()); update_device(data, data_h.data(), data_h.size(), handle.get_stream()); - std::vector out_dists_ref_h = - params.input_configuration.out_dists_ref_h; + std::vector out_dists_ref_h = params.input_configuration.out_dists_ref_h; allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1)); - update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), - handle.get_stream()); + update_device( + out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), handle.get_stream()); } - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam< SparseDistanceCOOSPMVInputs>::GetParam(); make_data(); - dist_config.b_nrows = params.input_configuration.indptr_h.size() - 1; - dist_config.b_ncols = params.input_configuration.n_cols; - dist_config.b_nnz = params.input_configuration.indices_h.size(); - dist_config.b_indptr = indptr; + dist_config.b_nrows = params.input_configuration.indptr_h.size() - 1; + dist_config.b_ncols = params.input_configuration.n_cols; + dist_config.b_nnz = params.input_configuration.indices_h.size(); + dist_config.b_indptr = indptr; dist_config.b_indices = indices; - dist_config.b_data = data; - dist_config.a_nrows = params.input_configuration.indptr_h.size() - 1; - dist_config.a_ncols = params.input_configuration.n_cols; - dist_config.a_nnz = params.input_configuration.indices_h.size(); - dist_config.a_indptr = indptr; + dist_config.b_data = data; + dist_config.a_nrows = params.input_configuration.indptr_h.size() - 1; + dist_config.a_ncols = params.input_configuration.n_cols; + dist_config.a_nnz = params.input_configuration.indices_h.size(); + dist_config.a_indptr = indptr; dist_config.a_indices = indices; - dist_config.a_data = data; + dist_config.a_data = data; int out_size = dist_config.a_nrows * dist_config.b_nrows; @@ -208,7 +213,8 @@ class SparseDistanceCOOSPMVTest CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); @@ -217,8 +223,10 @@ class SparseDistanceCOOSPMVTest CUDA_CHECK(cudaFree(out_dists_ref)); } - void compare() { - ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, + void compare() + { + ASSERT_TRUE(devArrMatch(out_dists_ref, + out_dists, params.input_configuration.out_dists_ref_h.size(), CompareApprox(1e-3))); } @@ -228,7 +236,7 @@ class SparseDistanceCOOSPMVTest // input data value_idx *indptr, *indices; - value_t *data; + value_t* data; // output data value_t *out_dists, *out_dists_ref; @@ -243,8 +251,7 @@ const InputConfiguration input_inner_product = { {0, 2, 4, 6, 8}, {0, 1, 0, 1, 0, 1, 0, 1}, {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}, - {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, - 5.0}, + {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0}, raft::distance::DistanceType::InnerProduct, 0.0}; @@ -275,384 +282,379 @@ const InputConfiguration input_l2_unexpanded = { raft::distance::DistanceType::L2Unexpanded, 0.0}; -const InputConfiguration input_canberra = - {10, - {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0.0, - 3.3954660629919076, - 5.6469232737388815, - 6.373112846266441, - 4.0212880272531715, - 6.916281504639404, - 5.741508386786526, - 5.411470999663036, - 9.0, - 4.977014354725805, - 3.3954660629919076, - 0.0, - 7.56256082439209, - 5.540261147481582, - 4.832322929216881, - 4.62003193872216, - 6.498056792320361, - 4.309846252268695, - 6.317531174829905, - 6.016362684141827, - 5.6469232737388815, - 7.56256082439209, - 0.0, - 5.974878731322299, - 4.898357301336036, - 6.442097410320605, - 5.227077347287883, - 7.134101195584642, - 5.457753923371659, - 7.0, - 6.373112846266441, - 5.540261147481582, - 5.974878731322299, - 0.0, - 5.5507273748583, - 4.897749658726415, - 9.0, - 8.398776718824767, - 3.908281400328807, - 4.83431066343688, - 4.0212880272531715, - 4.832322929216881, - 4.898357301336036, - 5.5507273748583, - 0.0, - 6.632989819428174, - 7.438852294822894, - 5.6631570310967465, - 7.579428202635459, - 6.760811985364303, - 6.916281504639404, - 4.62003193872216, - 6.442097410320605, - 4.897749658726415, - 6.632989819428174, - 0.0, - 5.249404187382862, - 6.072559523278559, - 4.07661278488929, - 6.19678948003145, - 5.741508386786526, - 6.498056792320361, - 5.227077347287883, - 9.0, - 7.438852294822894, - 5.249404187382862, - 0.0, - 3.854811639654704, - 6.652724827169063, - 5.298236851430971, - 5.411470999663036, - 4.309846252268695, - 7.134101195584642, - 8.398776718824767, - 5.6631570310967465, - 6.072559523278559, - 3.854811639654704, - 0.0, - 7.529184598969917, - 6.903282911791188, - 9.0, - 6.317531174829905, - 5.457753923371659, - 3.908281400328807, - 7.579428202635459, - 4.07661278488929, - 6.652724827169063, - 7.529184598969917, - 0.0, - 7.0, - 4.977014354725805, - 6.016362684141827, - 7.0, - 4.83431066343688, - 6.760811985364303, - 6.19678948003145, - 5.298236851430971, - 6.903282911791188, - 7.0, - 0.0}, - raft::distance::DistanceType::Canberra, - 0.0}; - -const InputConfiguration input_lp_unexpanded = - {10, - {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0.0, - 1.31462855332296, - 1.3690307816129905, - 1.698603990921237, - 1.3460470789553531, - 1.6636670712582544, - 1.2651744044972217, - 1.1938329352055201, - 1.8811409082590185, - 1.3653115050624267, - 1.31462855332296, - 0.0, - 1.9447722703291133, - 1.42818777206562, - 1.4685491458946494, - 1.3071999866010466, - 1.4988622861692171, - 0.9698559287406783, - 1.4972023224597841, - 1.5243383567266802, - 1.3690307816129905, - 1.9447722703291133, - 0.0, - 1.2748400840107568, - 1.0599569946448246, - 1.546591282841402, - 1.147526531928459, - 1.447002179128145, - 1.5982242387673176, - 1.3112533607072414, - 1.698603990921237, - 1.42818777206562, - 1.2748400840107568, - 0.0, - 1.038121552545461, - 1.011788365364402, - 1.3907391109256988, - 1.3128200942311496, - 1.19595706584447, - 1.3233328139624725, - 1.3460470789553531, - 1.4685491458946494, - 1.0599569946448246, - 1.038121552545461, - 0.0, - 1.3642741698145529, - 1.3493868683808095, - 1.394942694628328, - 1.572881849642552, - 1.380122665319464, - 1.6636670712582544, - 1.3071999866010466, - 1.546591282841402, - 1.011788365364402, - 1.3642741698145529, - 0.0, - 1.018961640373018, - 1.0114394258945634, - 0.8338711034820684, - 1.1247823842299223, - 1.2651744044972217, - 1.4988622861692171, - 1.147526531928459, - 1.3907391109256988, - 1.3493868683808095, - 1.018961640373018, - 0.0, - 0.7701238110357329, - 1.245486437864406, - 0.5551259549534626, - 1.1938329352055201, - 0.9698559287406783, - 1.447002179128145, - 1.3128200942311496, - 1.394942694628328, - 1.0114394258945634, - 0.7701238110357329, - 0.0, - 1.1886800117391216, - 1.0083692448135637, - 1.8811409082590185, - 1.4972023224597841, - 1.5982242387673176, - 1.19595706584447, - 1.572881849642552, - 0.8338711034820684, - 1.245486437864406, - 1.1886800117391216, - 0.0, - 1.3661374102525012, - 1.3653115050624267, - 1.5243383567266802, - 1.3112533607072414, - 1.3233328139624725, - 1.380122665319464, - 1.1247823842299223, - 0.5551259549534626, - 1.0083692448135637, - 1.3661374102525012, - 0.0}, - raft::distance::DistanceType::LpUnexpanded, - 2.0}; - -const InputConfiguration input_linf = - {10, - {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0.0, - 0.9251771844789913, - 0.9036452083899731, - 0.9251771844789913, - 0.8706483735804971, - 0.9251771844789913, - 0.717493881903289, - 0.6920214832303888, - 0.9251771844789913, - 0.9251771844789913, - 0.9251771844789913, - 0.0, - 0.9036452083899731, - 0.8655339692155823, - 0.8706483735804971, - 0.8655339692155823, - 0.8655339692155823, - 0.6329837991017668, - 0.8655339692155823, - 0.8655339692155823, - 0.9036452083899731, - 0.9036452083899731, - 0.0, - 0.7988276152181608, - 0.7028075145996631, - 0.9036452083899731, - 0.9036452083899731, - 0.9036452083899731, - 0.8429599432532096, - 0.9036452083899731, - 0.9251771844789913, - 0.8655339692155823, - 0.7988276152181608, - 0.0, - 0.48376552205293305, - 0.8206394616536681, - 0.8206394616536681, - 0.8206394616536681, - 0.8429599432532096, - 0.8206394616536681, - 0.8706483735804971, - 0.8706483735804971, - 0.7028075145996631, - 0.48376552205293305, - 0.0, - 0.8706483735804971, - 0.8706483735804971, - 0.8706483735804971, - 0.8429599432532096, - 0.8706483735804971, - 0.9251771844789913, - 0.8655339692155823, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.0, - 0.8853924473642432, - 0.535821510936138, - 0.6497196601457607, - 0.8853924473642432, - 0.717493881903289, - 0.8655339692155823, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.8853924473642432, - 0.0, - 0.5279604218147174, - 0.6658348373853169, - 0.33799874888632914, - 0.6920214832303888, - 0.6329837991017668, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.535821510936138, - 0.5279604218147174, - 0.0, - 0.662579808115858, - 0.5079750812968089, - 0.9251771844789913, - 0.8655339692155823, - 0.8429599432532096, - 0.8429599432532096, - 0.8429599432532096, - 0.6497196601457607, - 0.6658348373853169, - 0.662579808115858, - 0.0, - 0.8429599432532096, - 0.9251771844789913, - 0.8655339692155823, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.8853924473642432, - 0.33799874888632914, - 0.5079750812968089, - 0.8429599432532096, - 0.0}, - raft::distance::DistanceType::Linf, - 0.0}; - -const InputConfiguration input_l1 = { - 4, - {0, 1, 1, 2, 4}, - {3, 2, 0, 1}, // indices - {0.99296, 0.42180, 0.11687, 0.305869}, - { - // dense output - 0.0, - 0.99296, - 1.41476, - 1.415707, - 0.99296, - 0.0, - 0.42180, - 0.42274, - 1.41476, - 0.42180, - 0.0, - 0.84454, - 1.41570, - 0.42274, - 0.84454, - 0.0, - }, - raft::distance::DistanceType::L1, +const InputConfiguration input_canberra = { + 10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 3.3954660629919076, + 5.6469232737388815, + 6.373112846266441, + 4.0212880272531715, + 6.916281504639404, + 5.741508386786526, + 5.411470999663036, + 9.0, + 4.977014354725805, + 3.3954660629919076, + 0.0, + 7.56256082439209, + 5.540261147481582, + 4.832322929216881, + 4.62003193872216, + 6.498056792320361, + 4.309846252268695, + 6.317531174829905, + 6.016362684141827, + 5.6469232737388815, + 7.56256082439209, + 0.0, + 5.974878731322299, + 4.898357301336036, + 6.442097410320605, + 5.227077347287883, + 7.134101195584642, + 5.457753923371659, + 7.0, + 6.373112846266441, + 5.540261147481582, + 5.974878731322299, + 0.0, + 5.5507273748583, + 4.897749658726415, + 9.0, + 8.398776718824767, + 3.908281400328807, + 4.83431066343688, + 4.0212880272531715, + 4.832322929216881, + 4.898357301336036, + 5.5507273748583, + 0.0, + 6.632989819428174, + 7.438852294822894, + 5.6631570310967465, + 7.579428202635459, + 6.760811985364303, + 6.916281504639404, + 4.62003193872216, + 6.442097410320605, + 4.897749658726415, + 6.632989819428174, + 0.0, + 5.249404187382862, + 6.072559523278559, + 4.07661278488929, + 6.19678948003145, + 5.741508386786526, + 6.498056792320361, + 5.227077347287883, + 9.0, + 7.438852294822894, + 5.249404187382862, + 0.0, + 3.854811639654704, + 6.652724827169063, + 5.298236851430971, + 5.411470999663036, + 4.309846252268695, + 7.134101195584642, + 8.398776718824767, + 5.6631570310967465, + 6.072559523278559, + 3.854811639654704, + 0.0, + 7.529184598969917, + 6.903282911791188, + 9.0, + 6.317531174829905, + 5.457753923371659, + 3.908281400328807, + 7.579428202635459, + 4.07661278488929, + 6.652724827169063, + 7.529184598969917, + 0.0, + 7.0, + 4.977014354725805, + 6.016362684141827, + 7.0, + 4.83431066343688, + 6.760811985364303, + 6.19678948003145, + 5.298236851430971, + 6.903282911791188, + 7.0, + 0.0}, + raft::distance::DistanceType::Canberra, 0.0}; +const InputConfiguration input_lp_unexpanded = { + 10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 1.31462855332296, + 1.3690307816129905, + 1.698603990921237, + 1.3460470789553531, + 1.6636670712582544, + 1.2651744044972217, + 1.1938329352055201, + 1.8811409082590185, + 1.3653115050624267, + 1.31462855332296, + 0.0, + 1.9447722703291133, + 1.42818777206562, + 1.4685491458946494, + 1.3071999866010466, + 1.4988622861692171, + 0.9698559287406783, + 1.4972023224597841, + 1.5243383567266802, + 1.3690307816129905, + 1.9447722703291133, + 0.0, + 1.2748400840107568, + 1.0599569946448246, + 1.546591282841402, + 1.147526531928459, + 1.447002179128145, + 1.5982242387673176, + 1.3112533607072414, + 1.698603990921237, + 1.42818777206562, + 1.2748400840107568, + 0.0, + 1.038121552545461, + 1.011788365364402, + 1.3907391109256988, + 1.3128200942311496, + 1.19595706584447, + 1.3233328139624725, + 1.3460470789553531, + 1.4685491458946494, + 1.0599569946448246, + 1.038121552545461, + 0.0, + 1.3642741698145529, + 1.3493868683808095, + 1.394942694628328, + 1.572881849642552, + 1.380122665319464, + 1.6636670712582544, + 1.3071999866010466, + 1.546591282841402, + 1.011788365364402, + 1.3642741698145529, + 0.0, + 1.018961640373018, + 1.0114394258945634, + 0.8338711034820684, + 1.1247823842299223, + 1.2651744044972217, + 1.4988622861692171, + 1.147526531928459, + 1.3907391109256988, + 1.3493868683808095, + 1.018961640373018, + 0.0, + 0.7701238110357329, + 1.245486437864406, + 0.5551259549534626, + 1.1938329352055201, + 0.9698559287406783, + 1.447002179128145, + 1.3128200942311496, + 1.394942694628328, + 1.0114394258945634, + 0.7701238110357329, + 0.0, + 1.1886800117391216, + 1.0083692448135637, + 1.8811409082590185, + 1.4972023224597841, + 1.5982242387673176, + 1.19595706584447, + 1.572881849642552, + 0.8338711034820684, + 1.245486437864406, + 1.1886800117391216, + 0.0, + 1.3661374102525012, + 1.3653115050624267, + 1.5243383567266802, + 1.3112533607072414, + 1.3233328139624725, + 1.380122665319464, + 1.1247823842299223, + 0.5551259549534626, + 1.0083692448135637, + 1.3661374102525012, + 0.0}, + raft::distance::DistanceType::LpUnexpanded, + 2.0}; + +const InputConfiguration input_linf = { + 10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 0.9251771844789913, + 0.9036452083899731, + 0.9251771844789913, + 0.8706483735804971, + 0.9251771844789913, + 0.717493881903289, + 0.6920214832303888, + 0.9251771844789913, + 0.9251771844789913, + 0.9251771844789913, + 0.0, + 0.9036452083899731, + 0.8655339692155823, + 0.8706483735804971, + 0.8655339692155823, + 0.8655339692155823, + 0.6329837991017668, + 0.8655339692155823, + 0.8655339692155823, + 0.9036452083899731, + 0.9036452083899731, + 0.0, + 0.7988276152181608, + 0.7028075145996631, + 0.9036452083899731, + 0.9036452083899731, + 0.9036452083899731, + 0.8429599432532096, + 0.9036452083899731, + 0.9251771844789913, + 0.8655339692155823, + 0.7988276152181608, + 0.0, + 0.48376552205293305, + 0.8206394616536681, + 0.8206394616536681, + 0.8206394616536681, + 0.8429599432532096, + 0.8206394616536681, + 0.8706483735804971, + 0.8706483735804971, + 0.7028075145996631, + 0.48376552205293305, + 0.0, + 0.8706483735804971, + 0.8706483735804971, + 0.8706483735804971, + 0.8429599432532096, + 0.8706483735804971, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.0, + 0.8853924473642432, + 0.535821510936138, + 0.6497196601457607, + 0.8853924473642432, + 0.717493881903289, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.0, + 0.5279604218147174, + 0.6658348373853169, + 0.33799874888632914, + 0.6920214832303888, + 0.6329837991017668, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.535821510936138, + 0.5279604218147174, + 0.0, + 0.662579808115858, + 0.5079750812968089, + 0.9251771844789913, + 0.8655339692155823, + 0.8429599432532096, + 0.8429599432532096, + 0.8429599432532096, + 0.6497196601457607, + 0.6658348373853169, + 0.662579808115858, + 0.0, + 0.8429599432532096, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.33799874888632914, + 0.5079750812968089, + 0.8429599432532096, + 0.0}, + raft::distance::DistanceType::Linf, + 0.0}; + +const InputConfiguration input_l1 = {4, + {0, 1, 1, 2, 4}, + {3, 2, 0, 1}, // indices + {0.99296, 0.42180, 0.11687, 0.305869}, + { + // dense output + 0.0, + 0.99296, + 1.41476, + 1.415707, + 0.99296, + 0.0, + 0.42180, + 0.42274, + 1.41476, + 0.42180, + 0.0, + 0.84454, + 1.41570, + 0.42274, + 0.84454, + 0.0, + }, + raft::distance::DistanceType::L1, + 0.0}; + // test dense smem strategy -const std::vector< - SparseDistanceCOOSPMVInputs> - inputs_dense_strategy = {{input_inner_product}, {input_l2_unexpanded}, - {input_canberra}, {input_lp_unexpanded}, - {input_linf}, {input_l1}}; +const std::vector> + inputs_dense_strategy = {{input_inner_product}, + {input_l2_unexpanded}, + {input_canberra}, + {input_lp_unexpanded}, + {input_linf}, + {input_l1}}; typedef SparseDistanceCOOSPMVTest SparseDistanceCOOSPMVTestDenseStrategyF; @@ -662,22 +664,22 @@ INSTANTIATE_TEST_CASE_P(SparseDistanceCOOSPMVTests, ::testing::ValuesIn(inputs_dense_strategy)); // test hash and chunk strategy -const std::vector> - inputs_hash_strategy = {{input_inner_product}, - {input_inner_product, 0.5, 2}, - {input_l2_unexpanded}, - {input_l2_unexpanded, 0.5, 2}, - {input_canberra}, - {input_canberra, 0.5, 2}, - {input_canberra, 0.5, 6}, - {input_lp_unexpanded}, - {input_lp_unexpanded, 0.5, 2}, - {input_lp_unexpanded, 0.5, 6}, - {input_linf}, - {input_linf, 0.5, 2}, - {input_linf, 0.5, 6}, - {input_l1}, - {input_l1, 0.5, 2}}; +const std::vector> inputs_hash_strategy = { + {input_inner_product}, + {input_inner_product, 0.5, 2}, + {input_l2_unexpanded}, + {input_l2_unexpanded, 0.5, 2}, + {input_canberra}, + {input_canberra, 0.5, 2}, + {input_canberra, 0.5, 6}, + {input_lp_unexpanded}, + {input_lp_unexpanded, 0.5, 2}, + {input_lp_unexpanded, 0.5, 6}, + {input_linf}, + {input_linf, 0.5, 2}, + {input_linf, 0.5, 6}, + {input_l1}, + {input_l1, 0.5, 2}}; typedef SparseDistanceCOOSPMVTest SparseDistanceCOOSPMVTestHashStrategyF; diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu index 0589637061..8d6675f954 100644 --- a/cpp/test/sparse/distance.cu +++ b/cpp/test/sparse/distance.cu @@ -50,8 +50,8 @@ struct SparseDistanceInputs { }; template -::std::ostream &operator<<( - ::std::ostream &os, const SparseDistanceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SparseDistanceInputs& dims) +{ return os; } @@ -61,24 +61,24 @@ class SparseDistanceTest public: SparseDistanceTest() : dist_config(handle) {} - void SetUp() override { - params = ::testing::TestWithParam< - SparseDistanceInputs>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); make_data(); - dist_config.b_nrows = params.indptr_h.size() - 1; - dist_config.b_ncols = params.n_cols; - dist_config.b_nnz = params.indices_h.size(); - dist_config.b_indptr = indptr; + dist_config.b_nrows = params.indptr_h.size() - 1; + dist_config.b_ncols = params.n_cols; + dist_config.b_nnz = params.indices_h.size(); + dist_config.b_indptr = indptr; dist_config.b_indices = indices; - dist_config.b_data = data; - dist_config.a_nrows = params.indptr_h.size() - 1; - dist_config.a_ncols = params.n_cols; - dist_config.a_nnz = params.indices_h.size(); - dist_config.a_indptr = indptr; + dist_config.b_data = data; + dist_config.a_nrows = params.indptr_h.size() - 1; + dist_config.a_ncols = params.n_cols; + dist_config.a_nnz = params.indices_h.size(); + dist_config.a_indptr = indptr; dist_config.a_indices = indices; - dist_config.a_data = data; + dist_config.a_data = data; int out_size = dist_config.a_nrows * dist_config.b_nrows; @@ -89,7 +89,8 @@ class SparseDistanceTest CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); @@ -98,33 +99,34 @@ class SparseDistanceTest CUDA_CHECK(cudaFree(out_dists_ref)); } - void compare() { - ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, - params.out_dists_ref_h.size(), - CompareApprox(1e-3))); + void compare() + { + ASSERT_TRUE(devArrMatch( + out_dists_ref, out_dists, params.out_dists_ref_h.size(), CompareApprox(1e-3))); } protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); allocate(data, data_h.size()); - update_device(indptr, indptr_h.data(), indptr_h.size(), - handle.get_stream()); - update_device(indices, indices_h.data(), indices_h.size(), - handle.get_stream()); + update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream()); + update_device(indices, indices_h.data(), indices_h.size(), handle.get_stream()); update_device(data, data_h.data(), data_h.size(), handle.get_stream()); std::vector out_dists_ref_h = params.out_dists_ref_h; allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1)); - update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), + update_device(out_dists_ref, + out_dists_ref_h.data(), + out_dists_ref_h.size(), dist_config.handle.get_stream()); } @@ -132,7 +134,7 @@ class SparseDistanceTest // input data value_idx *indptr, *indices; - value_t *data; + value_t* data; // output data value_t *out_dists, *out_dists_ref; @@ -187,8 +189,7 @@ const std::vector> inputs_i32_f = { {0, 2, 4, 6, 8}, {0, 1, 0, 1, 0, 1, 0, 1}, {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}, - {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, - 5.0}, + {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0}, raft::distance::DistanceType::InnerProduct, 0.0}, {2, @@ -219,40 +220,33 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0., 0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, - 0.58146987, 0.44940102, 1., 0.76978799, 0.39419924, 0., - 0.97577154, 0.48904013, 0.48300801, 0.45087445, 0.73323749, 0.21050481, - 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0., 0.51413997, - 0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819, 1., - 0.79593037, 0.48904013, 0.51413997, 0., 0.28605559, 0.35772784, - 1., 0.60889396, 0.43324829, 0.84923694, 0.45658883, 0.48300801, - 0.31195441, 0.28605559, 0., 0.58623212, 0.6745457, 0.60287165, - 0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, - 0.58623212, 0., 0.77917274, 0.48390993, 0.24558392, 0.99166225, - 0.58146987, 0.73323749, 0.67534399, 1., 0.6745457, 0.77917274, - 0., 0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481, - 0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0., - 0.51360432, 0.68185144, 1., 0.54847744, 0.8321819, 0.43324829, - 0.67676228, 0.24558392, 0.76064776, 0.51360432, 0., 1., - 0.76978799, 0.78021386, 1., 0.84923694, 0.73155632, 0.99166225, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0., 0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, 0.58146987, 0.44940102, + 1., 0.76978799, 0.39419924, 0., 0.97577154, 0.48904013, 0.48300801, 0.45087445, + 0.73323749, 0.21050481, 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0., 0.51413997, + 0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819, 1., 0.79593037, 0.48904013, + 0.51413997, 0., 0.28605559, 0.35772784, 1., 0.60889396, 0.43324829, 0.84923694, + 0.45658883, 0.48300801, 0.31195441, 0.28605559, 0., 0.58623212, 0.6745457, 0.60287165, + 0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, 0.58623212, 0., + 0.77917274, 0.48390993, 0.24558392, 0.99166225, 0.58146987, 0.73323749, 0.67534399, 1., + 0.6745457, 0.77917274, 0., 0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481, + 0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0., 0.51360432, 0.68185144, + 1., 0.54847744, 0.8321819, 0.43324829, 0.67676228, 0.24558392, 0.76064776, 0.51360432, + 0., 1., 0.76978799, 0.78021386, 1., 0.84923694, 0.73155632, 0.99166225, 0.61547536, 0.68185144, 1., 0.}, raft::distance::DistanceType::CosineExpanded, 0.0}, {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, @@ -361,15 +355,13 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, {0.0, 3.3954660629919076, 5.6469232737388815, @@ -475,15 +467,13 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, {0.0, 1.31462855332296, 1.3690307816129905, @@ -589,15 +579,13 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, {0.0, 0.9251771844789913, 0.9036452083899731, @@ -703,17 +691,14 @@ const std::vector> inputs_i32_f = { {15, {0, 5, 8, 9, 15, 20, 26, 31, 34, 38, 45}, - {0, 1, 5, 6, 9, 1, 4, 14, 7, 3, 4, 7, 9, 11, 14, - 0, 3, 7, 8, 12, 0, 2, 5, 7, 8, 14, 4, 9, 10, 11, - 13, 4, 10, 14, 5, 6, 8, 9, 0, 2, 3, 4, 6, 10, 11}, - {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, - 0.73789274, 0.08450219, 1., 0.20184723, 0.18036963, 0.12581403, - 0.13867603, 0.24040536, 0.11288773, 0.00290246, 0.09120187, 0.31190555, - 0.43245423, 0.16153588, 0.3233026, 0.05279589, 0.1387149, 0.05962761, - 0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, - 0.15605804, 0.3867739, 0.24908977, 0.36413632, 0.37643732, 0.28910679, - 0.0198409, 0.31461499, 0.24412279, 0.08327667, 0.04444576, 0.05047969, - 0.26190054, 0.2077349, 0.10803964}, + {0, 1, 5, 6, 9, 1, 4, 14, 7, 3, 4, 7, 9, 11, 14, 0, 3, 7, 8, 12, 0, 2, 5, + 7, 8, 14, 4, 9, 10, 11, 13, 4, 10, 14, 5, 6, 8, 9, 0, 2, 3, 4, 6, 10, 11}, + {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, 0.73789274, 0.08450219, + 1., 0.20184723, 0.18036963, 0.12581403, 0.13867603, 0.24040536, 0.11288773, 0.00290246, + 0.09120187, 0.31190555, 0.43245423, 0.16153588, 0.3233026, 0.05279589, 0.1387149, 0.05962761, + 0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, 0.15605804, 0.3867739, + 0.24908977, 0.36413632, 0.37643732, 0.28910679, 0.0198409, 0.31461499, 0.24412279, 0.08327667, + 0.04444576, 0.05047969, 0.26190054, 0.2077349, 0.10803964}, {1.05367121e-08, 8.35309089e-01, 1.00000000e+00, 9.24116813e-01, 9.90039274e-01, 7.97613546e-01, 8.91271059e-01, 1.00000000e+00, 6.64669302e-01, 8.59439512e-01, 8.35309089e-01, 1.05367121e-08, @@ -772,31 +757,25 @@ const std::vector> inputs_i32_f = { {0, 3, 8, 12, 16, 20, 25, 30, 35, 40, 45}, {0, 3, 4, 0, 1, 2, 3, 4, 1, 2, 3, 4, 0, 2, 3, 4, 0, 1, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4}, - {0.70862347, 0.8232774, 0.12108795, 0.84527547, 0.94937088, 0.03258545, - 0.99584118, 0.76835667, 0.34426657, 0.2357925, 0.01274851, 0.11422017, - 0.3437756, 0.31967718, 0.5956055, 0.31610373, 0.04147273, 0.03724415, - 0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529, - 0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329, - 0.61364678, 0.22837736, 0.56609561, 0.29809423, 0.76736686, 0.56460608, - 0.98165371, 0.02140123, 0.19881268, 0.26057815, 0.31648823, 0.89874295, - 0.27366735, 0.5119944, 0.11416134}, + {0.70862347, 0.8232774, 0.12108795, 0.84527547, 0.94937088, 0.03258545, 0.99584118, 0.76835667, + 0.34426657, 0.2357925, 0.01274851, 0.11422017, 0.3437756, 0.31967718, 0.5956055, 0.31610373, + 0.04147273, 0.03724415, 0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529, + 0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329, 0.61364678, 0.22837736, + 0.56609561, 0.29809423, 0.76736686, 0.56460608, 0.98165371, 0.02140123, 0.19881268, 0.26057815, + 0.31648823, 0.89874295, 0.27366735, 0.5119944, 0.11416134}, {// dense output - 0., 0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794, - 0.76962708, 1.122858, 1.1232498, 1.08166081, 0.48769777, 0., - 1.31332116, 0.98318907, 0.42661815, 0.09279052, 1.35187836, 1.38429055, - 0.40658897, 0.56136388, 1.88014197, 1.31332116, 0., 1.82943642, - 1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848, - 0.26127048, 0.98318907, 1.82943642, 0., 0.29945563, 1.08494093, - 0.22934281, 0.82801925, 1.74288748, 1.50610116, 0.26657011, 0.42661815, - 1.54826077, 0.29945563, 0., 0.45060069, 0.77814948, 1.45245711, - 1.18328348, 0.82486987, 0.7874794, 0.09279052, 1.05918884, 1.08494093, - 0.45060069, 0., 1.29899154, 1.40683824, 0.48505269, 0.53862363, - 0.76962708, 1.35187836, 1.59360067, 0.22934281, 0.77814948, 1.29899154, - 0., 0.33202426, 1.92108999, 1.88812175, 1.122858, 1.38429055, - 1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0., - 1.47318624, 1.92660889, 1.1232498, 0.40658897, 0.60215168, 1.74288748, - 1.18328348, 0.48505269, 1.92108999, 1.47318624, 0., 0.24992619, - 1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363, + 0., 0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794, 0.76962708, 1.122858, + 1.1232498, 1.08166081, 0.48769777, 0., 1.31332116, 0.98318907, 0.42661815, 0.09279052, + 1.35187836, 1.38429055, 0.40658897, 0.56136388, 1.88014197, 1.31332116, 0., 1.82943642, + 1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848, 0.26127048, 0.98318907, + 1.82943642, 0., 0.29945563, 1.08494093, 0.22934281, 0.82801925, 1.74288748, 1.50610116, + 0.26657011, 0.42661815, 1.54826077, 0.29945563, 0., 0.45060069, 0.77814948, 1.45245711, + 1.18328348, 0.82486987, 0.7874794, 0.09279052, 1.05918884, 1.08494093, 0.45060069, 0., + 1.29899154, 1.40683824, 0.48505269, 0.53862363, 0.76962708, 1.35187836, 1.59360067, 0.22934281, + 0.77814948, 1.29899154, 0., 0.33202426, 1.92108999, 1.88812175, 1.122858, 1.38429055, + 1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0., 1.47318624, 1.92660889, + 1.1232498, 0.40658897, 0.60215168, 1.74288748, 1.18328348, 0.48505269, 1.92108999, 1.47318624, + 0., 0.24992619, 1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363, 1.88812175, 1.92660889, 0.24992619, 0.}, raft::distance::DistanceType::CorrelationExpanded, 0.0}, @@ -805,12 +784,11 @@ const std::vector> inputs_i32_f = { {1, 4, 0, 4, 1, 3, 0, 1, 3, 0}, {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, {// dense output - 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 0.8, 1., 1., 1., 1., - 1., 1., 1., 1., 0.8, 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., - 1., 1., 1., 1., 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 0.8, 1., 1., 1., - 1., 1., 1., 1., 0., 1., 0.8, 1., 1., 1., 1., 0.8, 1., 1., 1., 0., 1., - 1., 0.8, 0.8, 1., 1., 1., 0.8, 0.8, 1., 0., 1., 1., 1., 1., 1., 1., 1., - 1., 1., 1., 0., 1., 1., 1., 0.8, 1., 1., 1., 0.8, 1., 1., 0.}, + 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 0.8, 1., 1., 1., 1., 1., 1., 1., + 1., 0.8, 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., + 0.8, 1., 1., 1., 0., 1., 1., 0.8, 1., 1., 1., 1., 1., 1., 1., 0., 1., 0.8, 1., 1., + 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 0.8, 0.8, 1., 1., 1., 0.8, 0.8, 1., 0., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.8, 1., 1., 1., 0.8, 1., 1., 0.}, raft::distance::DistanceType::RusselRaoExpanded, 0.0}, {5, @@ -818,13 +796,12 @@ const std::vector> inputs_i32_f = { {0, 3, 4, 4, 2, 3, 0, 2, 3, 2}, {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, {// dense output - 0., 0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, - 0., 0.4, 0.6, 0.2, 0., 0.6, 0.4, 0., 0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4, - 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., 0.4, 0.2, 0.2, 0.2, 0., - 0.2, 0.6, 0.8, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., - 0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0., 0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8, - 0.6, 0.2, 0., 0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4, 0.2, 0.2, 0.4, 0., 0.2, - 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0.}, + 0., 0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, + 0.6, 0.2, 0., 0.6, 0.4, 0., 0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4, 0.2, 0., 0.4, 0., + 0.2, 0., 0.4, 0.6, 0.2, 0., 0.4, 0.2, 0.2, 0.2, 0., 0.2, 0.6, 0.8, 0.4, 0.2, 0.2, + 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., 0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0., 0.2, + 0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8, 0.6, 0.2, 0., 0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4, + 0.2, 0.2, 0.4, 0., 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0.}, raft::distance::DistanceType::HammingUnexpanded, 0.0}, {3, @@ -868,7 +845,8 @@ const std::vector> inputs_i32_f = { typedef SparseDistanceTest SparseDistanceTestF; TEST_P(SparseDistanceTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(SparseDistanceTests, SparseDistanceTestF, +INSTANTIATE_TEST_CASE_P(SparseDistanceTests, + SparseDistanceTestF, ::testing::ValuesIn(inputs_i32_f)); }; // namespace distance diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu index f7954f899f..02be95c8a8 100644 --- a/cpp/test/sparse/filter.cu +++ b/cpp/test/sparse/filter.cu @@ -36,8 +36,7 @@ struct SparseFilterInputs { }; template -class SparseFilterTests - : public ::testing::TestWithParam> { +class SparseFilterTests : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -50,14 +49,14 @@ class SparseFilterTests const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseFilterTests COORemoveZeros; -TEST_P(COORemoveZeros, Result) { +TEST_P(COORemoveZeros, Result) +{ cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); + std::shared_ptr alloc(new raft::mr::device::default_allocator); params = ::testing::TestWithParam>::GetParam(); - float *in_h_vals = new float[params.nnz]; + float* in_h_vals = new float[params.nnz]; COO in(alloc, stream, params.nnz, 5, 5); @@ -70,8 +69,8 @@ TEST_P(COORemoveZeros, Result) { in_h_vals[2] = 0; in_h_vals[3] = 0; - int *in_h_rows = new int[params.nnz]; - int *in_h_cols = new int[params.nnz]; + int* in_h_rows = new int[params.nnz]; + int* in_h_cols = new int[params.nnz]; for (int i = 0; i < params.nnz; i++) { in_h_rows[i] = params.nnz - i - 1; @@ -87,9 +86,9 @@ TEST_P(COORemoveZeros, Result) { int out_rows_ref_h[2] = {0, 3}; int out_cols_ref_h[2] = {4, 1}; - float *out_vals_ref_h = (float *)malloc(2 * sizeof(float)); - out_vals_ref_h[0] = in_h_vals[4]; - out_vals_ref_h[1] = in_h_vals[1]; + float* out_vals_ref_h = (float*)malloc(2 * sizeof(float)); + out_vals_ref_h[0] = in_h_vals[4]; + out_vals_ref_h[1] = in_h_vals[1]; COO out_ref(alloc, stream, 2, 5, 5); COO out(alloc, stream); @@ -100,12 +99,9 @@ TEST_P(COORemoveZeros, Result) { op::coo_remove_zeros<32, float>(&in, &out, alloc, stream); - ASSERT_TRUE(raft::devArrMatch(out_ref.rows(), out.rows(), 2, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out_ref.cols(), out.cols(), 2, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out_ref.vals(), out.vals(), 2, - raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out_ref.rows(), out.rows(), 2, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out_ref.cols(), out.cols(), 2, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out_ref.vals(), out.vals(), 2, raft::Compare())); CUDA_CHECK(cudaStreamDestroy(stream)); free(out_vals_ref_h); @@ -115,8 +111,7 @@ TEST_P(COORemoveZeros, Result) { delete[] in_h_vals; } -INSTANTIATE_TEST_CASE_P(SparseFilterTests, COORemoveZeros, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseFilterTests, COORemoveZeros, ::testing::ValuesIn(inputsf)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu index 8c3bf36318..ca9da0bc05 100644 --- a/cpp/test/sparse/knn.cu +++ b/cpp/test/sparse/knn.cu @@ -50,39 +50,53 @@ struct SparseKNNInputs { int batch_size_index = 2; int batch_size_query = 2; - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2SqrtExpanded; + raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded; }; template -::std::ostream &operator<<(::std::ostream &os, - const SparseKNNInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SparseKNNInputs& dims) +{ return os; } template -class SparseKNNTest - : public ::testing::TestWithParam> { +class SparseKNNTest : public ::testing::TestWithParam> { public: - void SetUp() override { - params = - ::testing::TestWithParam>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); n_rows = params.indptr_h.size() - 1; - nnz = params.indices_h.size(); - k = params.k; + nnz = params.indices_h.size(); + k = params.k; make_data(); - raft::sparse::selection::brute_force_knn( - indptr, indices, data, nnz, n_rows, params.n_cols, indptr, indices, data, - nnz, n_rows, params.n_cols, out_indices, out_dists, k, handle, - params.batch_size_index, params.batch_size_query, params.metric); + raft::sparse::selection::brute_force_knn(indptr, + indices, + data, + nnz, + n_rows, + params.n_cols, + indptr, + indices, + data, + nnz, + n_rows, + params.n_cols, + out_indices, + out_dists, + k, + handle, + params.batch_size_index, + params.batch_size_query, + params.metric); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); CUDA_CHECK(cudaFree(data)); @@ -92,39 +106,37 @@ class SparseKNNTest CUDA_CHECK(cudaFree(out_dists_ref)); } - void compare() { - ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k, - CompareApprox(1e-4))); - ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k, - Compare())); + void compare() + { + ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k, CompareApprox(1e-4))); + ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k, Compare())); } protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); allocate(data, data_h.size()); - update_device(indptr, indptr_h.data(), indptr_h.size(), - handle.get_stream()); - update_device(indices, indices_h.data(), indices_h.size(), - handle.get_stream()); + update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream()); + update_device(indices, indices_h.data(), indices_h.size(), handle.get_stream()); update_device(data, data_h.data(), data_h.size(), handle.get_stream()); - std::vector out_dists_ref_h = params.out_dists_ref_h; + std::vector out_dists_ref_h = params.out_dists_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; allocate(out_indices_ref, out_indices_ref_h.size()); allocate(out_dists_ref, out_dists_ref_h.size()); - update_device(out_indices_ref, out_indices_ref_h.data(), - out_indices_ref_h.size(), handle.get_stream()); - update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), - handle.get_stream()); + update_device( + out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), handle.get_stream()); + update_device( + out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), handle.get_stream()); allocate(out_dists, n_rows * k); allocate(out_indices, n_rows * k); @@ -136,14 +148,14 @@ class SparseKNNTest // input data value_idx *indptr, *indices; - value_t *data; + value_t* data; // output data - value_idx *out_indices; - value_t *out_dists; + value_idx* out_indices; + value_t* out_dists; - value_idx *out_indices_ref; - value_t *out_dists_ref; + value_idx* out_indices_ref; + value_t* out_dists_ref; SparseKNNInputs params; }; @@ -161,8 +173,7 @@ const std::vector> inputs_i32_f = { raft::distance::DistanceType::L2SqrtExpanded}}; typedef SparseKNNTest SparseKNNTestF; TEST_P(SparseKNNTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, - ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace selection }; // end namespace sparse diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu index ec41b32374..f660e68aa3 100644 --- a/cpp/test/sparse/knn_graph.cu +++ b/cpp/test/sparse/knn_graph.cu @@ -29,8 +29,9 @@ namespace raft { namespace sparse { template -__global__ void assert_symmetry(value_idx *rows, value_idx *cols, value_t *vals, - value_idx nnz, value_idx *sum) { +__global__ void assert_symmetry( + value_idx* rows, value_idx* cols, value_t* vals, value_idx nnz, value_idx* sum) +{ int tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; @@ -50,22 +51,21 @@ struct KNNGraphInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const KNNGraphInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const KNNGraphInputs& dims) +{ return os; } template -class KNNGraphTest - : public ::testing::TestWithParam> { - void SetUp() override { - params = - ::testing::TestWithParam>::GetParam(); +class KNNGraphTest : public ::testing::TestWithParam> { + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); raft::handle_t handle; auto alloc = handle.get_device_allocator(); - stream = handle.get_stream(); + stream = handle.get_stream(); out = new raft::sparse::COO(alloc, stream); @@ -74,8 +74,7 @@ class KNNGraphTest update_device(X, params.X.data(), params.X.size(), stream); raft::sparse::selection::knn_graph( - handle, X, params.m, params.n, raft::distance::DistanceType::L2Unexpanded, - *out); + handle, X, params.m, params.n, raft::distance::DistanceType::L2Unexpanded, *out); rmm::device_uvector sum(1, stream); @@ -91,7 +90,8 @@ class KNNGraphTest CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(X)); delete out; @@ -101,9 +101,9 @@ class KNNGraphTest cudaStream_t stream; // input data - raft::sparse::COO *out; + raft::sparse::COO* out; - value_t *X; + value_t* X; value_idx sum_h; @@ -115,13 +115,15 @@ const std::vector> knn_graph_inputs_fint = { {4, 2, {0, 100, 0.01, 0.02, 5000, 10000, -5, -2}, 2}}; typedef KNNGraphTest KNNGraphTestF_int; -TEST_P(KNNGraphTestF_int, Result) { +TEST_P(KNNGraphTestF_int, Result) +{ // nnz should not be larger than twice m * k ASSERT_TRUE(out->nnz <= (params.m * params.k * 2)); ASSERT_TRUE(sum_h == 0); } -INSTANTIATE_TEST_CASE_P(KNNGraphTest, KNNGraphTestF_int, +INSTANTIATE_TEST_CASE_P(KNNGraphTest, + KNNGraphTestF_int, ::testing::ValuesIn(knn_graph_inputs_fint)); } // namespace sparse diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index ce567e4298..0ca7cec4e9 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -55,45 +55,44 @@ struct LinkageInputs { * @param b: number of pairs of points that both the clusters have classified differently */ template -__global__ void computeTheNumerator(const T* firstClusterArray, - const T* secondClusterArray, uint64_t size, - uint64_t* a, uint64_t* b) { - //calculating the indices of pairs of datapoints compared by the current thread +__global__ void computeTheNumerator( + const T* firstClusterArray, const T* secondClusterArray, uint64_t size, uint64_t* a, uint64_t* b) +{ + // calculating the indices of pairs of datapoints compared by the current thread uint64_t j = threadIdx.x + blockIdx.x * blockDim.x; uint64_t i = threadIdx.y + blockIdx.y * blockDim.y; - //thread-local variables to count a and b + // thread-local variables to count a and b uint64_t myA = 0, myB = 0; if (i < size && j < size && j < i) { - //checking if the pair have been classified the same by both the clusters + // checking if the pair have been classified the same by both the clusters if (firstClusterArray[i] == firstClusterArray[j] && secondClusterArray[i] == secondClusterArray[j]) { ++myA; } - //checking if the pair have been classified differently by both the clusters + // checking if the pair have been classified differently by both the clusters else if (firstClusterArray[i] != firstClusterArray[j] && secondClusterArray[i] != secondClusterArray[j]) { ++myB; } } - //specialize blockReduce for a 2D block of 1024 threads of type uint64_t - typedef cub::BlockReduce + // specialize blockReduce for a 2D block of 1024 threads of type uint64_t + typedef cub::BlockReduce BlockReduce; - //Allocate shared memory for blockReduce + // Allocate shared memory for blockReduce __shared__ typename BlockReduce::TempStorage temp_storage; - //summing up thread-local counts specific to a block + // summing up thread-local counts specific to a block myA = BlockReduce(temp_storage).Sum(myA); __syncthreads(); myB = BlockReduce(temp_storage).Sum(myB); __syncthreads(); - //executed once per block + // executed once per block if (threadIdx.x == 0 && threadIdx.y == 0) { raft::myAtomicAdd((unsigned long long int*)a, myA); raft::myAtomicAdd((unsigned long long int*)b, myB); @@ -101,102 +100,105 @@ __global__ void computeTheNumerator(const T* firstClusterArray, } /** -* @brief Function to calculate RandIndex -* more info on rand index -* @param firstClusterArray: the array of classes of type T -* @param secondClusterArray: the array of classes of type T -* @param size: the size of the data points of type uint64_t -* @param allocator: object that takes care of temporary device memory allocation of type std::shared_ptr -* @param stream: the cudaStream object -*/ + * @brief Function to calculate RandIndex + * more info on rand index + * @param firstClusterArray: the array of classes of type T + * @param secondClusterArray: the array of classes of type T + * @param size: the size of the data points of type uint64_t + * @param allocator: object that takes care of temporary device memory allocation of type + * std::shared_ptr + * @param stream: the cudaStream object + */ template -double compute_rand_index( - T* firstClusterArray, T* secondClusterArray, uint64_t size, - std::shared_ptr allocator, cudaStream_t stream) { - //rand index for size less than 2 is not defined +double compute_rand_index(T* firstClusterArray, + T* secondClusterArray, + uint64_t size, + std::shared_ptr allocator, + cudaStream_t stream) +{ + // rand index for size less than 2 is not defined ASSERT(size >= 2, "Rand Index for size less than 2 not defined!"); - //allocating and initializing memory for a and b in the GPU + // allocating and initializing memory for a and b in the GPU raft::mr::device::buffer arr_buf(allocator, stream, 2); CUDA_CHECK(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream)); - //kernel configuration + // kernel configuration static const int BLOCK_DIM_Y = 16, BLOCK_DIM_X = 16; dim3 numThreadsPerBlock(BLOCK_DIM_X, BLOCK_DIM_Y); dim3 numBlocks(raft::ceildiv(size, numThreadsPerBlock.x), raft::ceildiv(size, numThreadsPerBlock.y)); - //calling the kernel - computeTheNumerator - <<>>( - firstClusterArray, secondClusterArray, size, arr_buf.data(), - arr_buf.data() + 1); + // calling the kernel + computeTheNumerator<<>>( + firstClusterArray, secondClusterArray, size, arr_buf.data(), arr_buf.data() + 1); - //synchronizing and updating the calculated values of a and b from device to host + // synchronizing and updating the calculated values of a and b from device to host uint64_t ab_host[2] = {0}; raft::update_host(ab_host, arr_buf.data(), 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); - //error handling + // error handling CUDA_CHECK(cudaGetLastError()); - //denominator + // denominator uint64_t nChooseTwo = size * (size - 1) / 2; - //calculating the rand_index + // calculating the rand_index return (double)(((double)(ab_host[0] + ab_host[1])) / (double)nChooseTwo); } template -::std::ostream& operator<<(::std::ostream& os, - const LinkageInputs& dims) { +::std::ostream& operator<<(::std::ostream& os, const LinkageInputs& dims) +{ return os; } template class LinkageTest : public ::testing::TestWithParam> { protected: - void basicTest() { + void basicTest() + { raft::handle_t handle; params = ::testing::TestWithParam>::GetParam(); - rmm::device_uvector data(params.n_row * params.n_col, - handle.get_stream()); + rmm::device_uvector data(params.n_row * params.n_col, handle.get_stream()); // Allocate result labels and expected labels on device raft::allocate(labels, params.n_row); raft::allocate(labels_ref, params.n_row); - raft::copy(data.data(), params.data.data(), data.size(), - handle.get_stream()); - raft::copy(labels_ref, params.expected_labels.data(), params.n_row, - handle.get_stream()); + raft::copy(data.data(), params.data.data(), data.size(), handle.get_stream()); + raft::copy(labels_ref, params.expected_labels.data(), params.n_row, handle.get_stream()); raft::hierarchy::linkage_output out_arrs; out_arrs.labels = labels; - rmm::device_uvector out_children(params.n_row * 2, - handle.get_stream()); + rmm::device_uvector out_children(params.n_row * 2, handle.get_stream()); out_arrs.children = out_children.data(); - raft::hierarchy::single_linkage< - IdxT, T, raft::hierarchy::LinkageDistance::KNN_GRAPH>( - handle, data.data(), params.n_row, params.n_col, - raft::distance::DistanceType::L2SqrtExpanded, &out_arrs, params.c, + raft::hierarchy::single_linkage( + handle, + data.data(), + params.n_row, + params.n_col, + raft::distance::DistanceType::L2SqrtExpanded, + &out_arrs, + params.c, params.n_clusters); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); - score = - compute_rand_index(labels, labels_ref, params.n_row, - handle.get_device_allocator(), handle.get_stream()); + score = compute_rand_index( + labels, labels_ref, params.n_row, handle.get_device_allocator(), handle.get_stream()); } void SetUp() override { basicTest(); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(labels)); CUDA_CHECK(cudaFree(labels_ref)); } @@ -212,14 +214,12 @@ const std::vector> linkage_inputsf2 = { // Test n_clusters == n_points {10, 5, - {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, - 0.77782677, 0.43772379, 0.4035871, 0.3282796, 0.47544681, 0.59862974, - 0.12319357, 0.06239463, 0.28200272, 0.1345717, 0.50498218, 0.5113505, - 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, - 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, - 0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792, - 0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692, - 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, + {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379, + 0.4035871, 0.3282796, 0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717, + 0.50498218, 0.5113505, 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, + 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, 0.84854131, 0.28890216, + 0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554, + 0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, 0.76166195, 0.66613745}, {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, 10, @@ -227,8 +227,7 @@ const std::vector> linkage_inputsf2 = { // // Test outlier points {9, 2, - {-1, -50, 3, 4, 5000, 10000, 1, 3, 4, 5, 0.000005, 0.00002, 2000000, 500000, - 10, 50, 30, 5}, + {-1, -50, 3, 4, 5000, 10000, 1, 3, 4, 5, 0.000005, 0.00002, 2000000, 500000, 10, 50, 30, 5}, {6, 0, 5, 0, 0, 4, 3, 2, 1}, 7, -1}, @@ -236,14 +235,12 @@ const std::vector> linkage_inputsf2 = { // Test n_clusters == (n_points / 2) {10, 5, - {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, - 0.77782677, 0.43772379, 0.4035871, 0.3282796, 0.47544681, 0.59862974, - 0.12319357, 0.06239463, 0.28200272, 0.1345717, 0.50498218, 0.5113505, - 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, - 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, - 0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792, - 0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692, - 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, + {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379, + 0.4035871, 0.3282796, 0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717, + 0.50498218, 0.5113505, 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, + 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, 0.84854131, 0.28890216, + 0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554, + 0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, 0.76166195, 0.66613745}, {1, 0, 4, 0, 0, 3, 2, 0, 2, 1}, 5, @@ -252,340 +249,173 @@ const std::vector> linkage_inputsf2 = { // Test n_points == 100 {100, 10, - {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, - 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, - 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, - 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, - 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, - 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, - 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, - 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, - 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, - 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, - 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, - 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, - 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, - 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, - 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, - 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, - 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, - 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, - 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, - 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, - 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, - 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, - 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, - 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, - 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, - 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, - 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, - 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, - 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, - 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, - 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, - 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, - 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, - 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, - 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, - 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, - 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, - 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, - 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, - 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, - 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, - 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, - 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, - 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, - 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, - 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, - 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, - 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, - 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, - 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, - 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, - 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, - 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, - 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, - 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, - 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, - 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, - 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, - 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, - 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, - 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, - 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, - 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, - 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, - 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, - 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, - 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, - 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, - 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, - 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, - 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, - 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, - 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, - 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, - 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, - 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, - 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, - 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, - 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, - 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, - 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, - 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, - 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, - 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, - 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, - 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, - 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, - 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, - 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, - 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, - 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, - 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, - 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, - 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, - 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, - 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, - 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, - 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, - 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, - 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, - 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, - 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, - 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, - 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, - 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, - 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, - 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, - 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, - 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, - 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, - 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, - 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, - 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, - 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, - 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, - 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, - 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, - 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, - 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, - 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, - 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, - 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, - 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, - 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, - 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, - 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, - 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, - 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, - 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, - 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, - 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, - 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, - 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, - 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, - 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, - 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, - 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, - 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, - 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, - 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, - 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, - 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, - 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, - 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, - 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, - 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, - 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, - 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, - 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, - 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, - 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, - 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, - 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, - 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, - 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, - 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, - 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, - 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, - 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, - 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, - 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, - 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, - 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, - 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, - 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, - 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, - 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, - 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, - 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, - 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, - 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, - 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, - 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, - 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, - 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, - 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, - 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, - 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, - 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, - 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, - 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, - 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, - 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, - 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, - 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, - 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, - 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, - 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, - 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, - 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, - 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, - 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, - 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, - 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, - 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, - 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, - 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, - 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, - 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, - 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, - 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, - 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, - 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, - 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, - 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, - 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, - 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, - 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, - 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, - 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, - 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, - 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, - 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, - 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, - 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, - 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, - 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, - 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, - 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, - 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, - 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, - 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, - 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, - 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, - 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, - 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, - 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, - 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, - 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, - 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, - 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, - 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, - 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, - 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, - 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, - 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, - 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, - 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, - 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, - 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, - 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, - 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, - 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, - 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, - 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, - 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, - 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, - 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, - 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, - 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, - 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, - 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, - 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, - 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, - 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, - 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, - 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, - 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, - 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, - 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, - 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, - 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, - 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, - 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, - 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, - 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, - 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, - 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, - 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, - 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, - 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, - 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, - 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, - 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, - 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, - 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, - 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, - 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, - 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, - 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, - 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, - 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, - 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, - 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, - 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, - 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, - 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, - 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, - 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, - 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, - 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, - 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, - 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, - 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, - 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, - 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, - 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, - 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, - 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, - 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, - 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, - 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, - 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, - 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, - 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, - 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, - 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, - 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, - 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, - 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, - 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, - 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, - 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, - 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, - 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, - 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, - 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, - 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, - 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, - 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, - 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, - 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, - 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, - 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, - 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, - 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, - 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, - 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, - 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, - 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, - 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, - 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, - 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, - 8.66342445e-01 + {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, + 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, + 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, + 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, + 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, + 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, + 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, + 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, + 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, + 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, + 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, + 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, + 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, + 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, + 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, + 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, + 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, + 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, + 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, + 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, + 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, + 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, + 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, + 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, + 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, + 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, + 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, + 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, + 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, + 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, + 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, + 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, + 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, + 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, + 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, + 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, + 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, + 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, + 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, + 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, + 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, + 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, + 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, + 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, + 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, + 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, + 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, + 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, + 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, + 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, + 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, + 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, + 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, + 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, + 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, + 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, + 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, + 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, + 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, + 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, + 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, + 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, + 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, + 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, + 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, + 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, + 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, + 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, + 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, + 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, + 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, + 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, + 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, + 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, + 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, + 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, + 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, + 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, + 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, + 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, + 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, + 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, + 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, + 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, + 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, + 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, + 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, + 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, + 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, + 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, + 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, + 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, + 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, + 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, + 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, + 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, + 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, + 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, + 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, + 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, + 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, + 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, + 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, + 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, + 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, + 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, + 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, + 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, + 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, + 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, + 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, + 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, + 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, + 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, + 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, + 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, + 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, + 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, + 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, + 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, + 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, + 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, + 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, + 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, + 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, + 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, + 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, + 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, + 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, + 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, + 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, + 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, + 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, + 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, + 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, + 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, + 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, + 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, + 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, + 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, + 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, + 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, + 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, + 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, + 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, + 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, + 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, + 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, + 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, + 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, + 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, + 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, + 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, + 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, + 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, + 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, + 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, + 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, + 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, + 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, + 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, + 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, + 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, + 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, + 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, + 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, + 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01 }, {0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -598,6 +428,5 @@ const std::vector> linkage_inputsf2 = { typedef LinkageTest LinkageTestF_Int; TEST_P(LinkageTestF_Int, Result) { EXPECT_TRUE(score == 1.0); } -INSTANTIATE_TEST_CASE_P(LinkageTest, LinkageTestF_Int, - ::testing::ValuesIn(linkage_inputsf2)); +INSTANTIATE_TEST_CASE_P(LinkageTest, LinkageTestF_Int, ::testing::ValuesIn(linkage_inputsf2)); } // end namespace raft diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu index 7adbbf8b9a..4897d8194b 100644 --- a/cpp/test/sparse/norm.cu +++ b/cpp/test/sparse/norm.cu @@ -39,12 +39,11 @@ struct CSRRowNormalizeInputs { }; template -class CSRRowNormalizeTest - : public ::testing::TestWithParam> { +class CSRRowNormalizeTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = ::testing::TestWithParam< - CSRRowNormalizeInputs>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); cudaStreamCreate(&stream); raft::allocate(in_vals, params.in_vals.size()); @@ -53,9 +52,10 @@ class CSRRowNormalizeTest raft::allocate(result, params.verify.size(), true); } - void Run() { + void Run() + { Index_ n_rows = params.ex_scan.size(); - Index_ nnz = params.in_vals.size(); + Index_ nnz = params.in_vals.size(); raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream); raft::update_device(in_vals, params.in_vals.data(), nnz, stream); @@ -63,20 +63,18 @@ class CSRRowNormalizeTest switch (params.method) { case MAX: - linalg::csr_row_normalize_max<32, Type_f>(ex_scan, in_vals, nnz, n_rows, - result, stream); + linalg::csr_row_normalize_max<32, Type_f>(ex_scan, in_vals, nnz, n_rows, result, stream); break; case L1: - linalg::csr_row_normalize_l1<32, Type_f>(ex_scan, in_vals, nnz, n_rows, - result, stream); + linalg::csr_row_normalize_l1<32, Type_f>(ex_scan, in_vals, nnz, n_rows, result, stream); break; } - ASSERT_TRUE( - raft::devArrMatch(verify, result, nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(verify, result, nnz, raft::Compare())); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(ex_scan)); CUDA_CHECK(cudaFree(in_vals)); CUDA_CHECK(cudaFree(verify)); @@ -87,7 +85,7 @@ class CSRRowNormalizeTest protected: CSRRowNormalizeInputs params; cudaStream_t stream; - Index_ *ex_scan; + Index_* ex_scan; Type_f *in_vals, *result, *verify; }; @@ -118,9 +116,11 @@ const std::vector> csrnormalize_inputs_d = { {0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 1, 0.0}}, }; -INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestF, +INSTANTIATE_TEST_CASE_P(SparseNormTest, + CSRRowNormalizeTestF, ::testing::ValuesIn(csrnormalize_inputs_f)); -INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestD, +INSTANTIATE_TEST_CASE_P(SparseNormTest, + CSRRowNormalizeTestD, ::testing::ValuesIn(csrnormalize_inputs_d)); } // namespace sparse diff --git a/cpp/test/sparse/reduce.cu b/cpp/test/sparse/reduce.cu index 50b5dc5993..44098214d2 100644 --- a/cpp/test/sparse/reduce.cu +++ b/cpp/test/sparse/reduce.cu @@ -42,19 +42,19 @@ struct SparseReduceInputs { }; template -class SparseReduceTest - : public ::testing::TestWithParam> { +class SparseReduceTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = ::testing::TestWithParam< - SparseReduceInputs>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); } - void Run() { + void Run() + { raft::handle_t handle; auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); rmm::device_uvector in_rows(params.in_rows.size(), stream); rmm::device_uvector in_cols(params.in_cols.size(), stream); @@ -63,30 +63,29 @@ class SparseReduceTest rmm::device_uvector out_cols(params.out_cols.size(), stream); rmm::device_uvector out_vals(params.out_vals.size(), stream); - raft::update_device(in_rows.data(), params.in_rows.data(), - params.in_rows.size(), stream); - raft::update_device(in_cols.data(), params.in_cols.data(), - params.in_cols.size(), stream); - raft::update_device(in_vals.data(), params.in_vals.data(), - params.in_vals.size(), stream); - raft::update_device(out_rows.data(), params.out_rows.data(), - params.out_rows.size(), stream); - raft::update_device(out_cols.data(), params.out_cols.data(), - params.out_cols.size(), stream); - raft::update_device(out_vals.data(), params.out_vals.data(), - params.out_vals.size(), stream); + raft::update_device(in_rows.data(), params.in_rows.data(), params.in_rows.size(), stream); + raft::update_device(in_cols.data(), params.in_cols.data(), params.in_cols.size(), stream); + raft::update_device(in_vals.data(), params.in_vals.data(), params.in_vals.size(), stream); + raft::update_device(out_rows.data(), params.out_rows.data(), params.out_rows.size(), stream); + raft::update_device(out_cols.data(), params.out_cols.data(), params.out_cols.size(), stream); + raft::update_device(out_vals.data(), params.out_vals.data(), params.out_vals.size(), stream); raft::sparse::COO out(d_alloc, stream); - raft::sparse::op::max_duplicates(handle, out, in_rows.data(), - in_cols.data(), in_vals.data(), - params.in_rows.size(), params.m, params.n); + raft::sparse::op::max_duplicates(handle, + out, + in_rows.data(), + in_cols.data(), + in_vals.data(), + params.in_rows.size(), + params.m, + params.n); ASSERT_TRUE(raft::devArrMatch( out_rows.data(), out.rows(), out.nnz, raft::Compare())); ASSERT_TRUE(raft::devArrMatch( out_cols.data(), out.cols(), out.nnz, raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out_vals.data(), out.vals(), out.nnz, - raft::Compare())); + ASSERT_TRUE( + raft::devArrMatch(out_vals.data(), out.vals(), out.nnz, raft::Compare())); } void TearDown() override {} @@ -115,7 +114,8 @@ const std::vector> max_reduce_inputs_f = { 4}, }; -INSTANTIATE_TEST_CASE_P(SparseReduceTest, SparseReduceTestF, +INSTANTIATE_TEST_CASE_P(SparseReduceTest, + SparseReduceTestF, ::testing::ValuesIn(max_reduce_inputs_f)); } // namespace sparse diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu index b64fa25883..feefa7baa3 100644 --- a/cpp/test/sparse/row_op.cu +++ b/cpp/test/sparse/row_op.cu @@ -38,43 +38,47 @@ struct CSRRowOpInputs { /** Wrapper to call csr_row_op because the enclosing function of a __device__ * lambda cannot have private ot protected access within the class. */ template -void csr_row_op_wrapper(const Index_ *row_ind, Index_ n_rows, Index_ nnz, - Type_f *result, cudaStream_t stream) { +void csr_row_op_wrapper( + const Index_* row_ind, Index_ n_rows, Index_ nnz, Type_f* result, cudaStream_t stream) +{ op::csr_row_op( - row_ind, n_rows, nnz, + row_ind, + n_rows, + nnz, [result] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) { - for (Index_ i = start_idx; i < stop_idx; i++) result[i] = row; + for (Index_ i = start_idx; i < stop_idx; i++) + result[i] = row; }, stream); } template -class CSRRowOpTest - : public ::testing::TestWithParam> { +class CSRRowOpTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = - ::testing::TestWithParam>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); cudaStreamCreate(&stream); n_rows = params.ex_scan.size(); - nnz = params.verify.size(); + nnz = params.verify.size(); raft::allocate(verify, nnz); raft::allocate(ex_scan, n_rows); raft::allocate(result, nnz, true); } - void Run() { + void Run() + { raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream); raft::update_device(verify, params.verify.data(), nnz, stream); csr_row_op_wrapper(ex_scan, n_rows, nnz, result, stream); - ASSERT_TRUE( - raft::devArrMatch(verify, result, nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(verify, result, nnz, raft::Compare())); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(ex_scan)); CUDA_CHECK(cudaFree(verify)); CUDA_CHECK(cudaFree(result)); @@ -85,7 +89,7 @@ class CSRRowOpTest CSRRowOpInputs params; cudaStream_t stream; Index_ n_rows, nnz; - Index_ *ex_scan; + Index_* ex_scan; Type_f *result, *verify; }; @@ -102,10 +106,8 @@ const std::vector> csrrowop_inputs_d = { {{0, 4, 8, 9}, {0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0}}, }; -INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestF, - ::testing::ValuesIn(csrrowop_inputs_f)); -INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestD, - ::testing::ValuesIn(csrrowop_inputs_d)); +INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestF, ::testing::ValuesIn(csrrowop_inputs_f)); +INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestD, ::testing::ValuesIn(csrrowop_inputs_d)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/selection.cu b/cpp/test/sparse/selection.cu index 46f2f6a844..5d3b2a8317 100644 --- a/cpp/test/sparse/selection.cu +++ b/cpp/test/sparse/selection.cu @@ -45,8 +45,9 @@ struct SparseSelectionInputs { }; template -::std::ostream &operator<<( - ::std::ostream &os, const SparseSelectionInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, + const SparseSelectionInputs& dims) +{ return os; } @@ -54,7 +55,8 @@ template class SparseSelectionTest : public ::testing::TestWithParam> { protected: - void make_data() { + void make_data() + { std::vector dists_h = params.dists_h; allocate(dists, n_rows * n_cols); @@ -63,42 +65,39 @@ class SparseSelectionTest allocate(inds, n_rows * n_cols); iota_fill(inds, n_rows, n_cols, stream); - std::vector out_dists_ref_h = params.out_dists_ref_h; + std::vector out_dists_ref_h = params.out_dists_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; allocate(out_indices_ref, out_indices_ref_h.size()); allocate(out_dists_ref, out_dists_ref_h.size()); - update_device(out_indices_ref, out_indices_ref_h.data(), - out_indices_ref_h.size(), stream); - update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), - stream); + update_device(out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), stream); + update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), stream); allocate(out_dists, n_rows * k); allocate(out_indices, n_rows * k); } - void SetUp() override { - params = ::testing::TestWithParam< - SparseSelectionInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); + std::shared_ptr alloc(new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); n_rows = params.n_rows; n_cols = params.n_cols; - k = params.k; + k = params.k; make_data(); - raft::sparse::selection::select_k(dists, inds, n_rows, n_cols, out_dists, - out_indices, params.select_min, k, - stream); + raft::sparse::selection::select_k( + dists, inds, n_rows, n_cols, out_dists, out_indices, params.select_min, k, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(dists)); @@ -111,11 +110,10 @@ class SparseSelectionTest CUDA_CHECK(cudaStreamDestroy(stream)); } - void compare() { - ASSERT_TRUE( - devArrMatch(out_dists_ref, out_dists, n_rows * k, Compare())); - ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k, - Compare())); + void compare() + { + ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k, Compare())); + ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k, Compare())); } protected: @@ -124,15 +122,15 @@ class SparseSelectionTest int n_rows, n_cols, k; // input data - value_t *dists; - value_idx *inds; + value_t* dists; + value_idx* inds; // output data - value_idx *out_indices; - value_t *out_dists; + value_idx* out_indices; + value_t* out_dists; - value_idx *out_indices_ref; - value_t *out_dists_ref; + value_idx* out_indices_ref; + value_t* out_dists_ref; SparseSelectionInputs params; }; @@ -149,7 +147,8 @@ const std::vector> inputs_i32_f = { true}}; typedef SparseSelectionTest SparseSelectionTestF; TEST_P(SparseSelectionTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(SparseSelectionTest, SparseSelectionTestF, +INSTANTIATE_TEST_CASE_P(SparseSelectionTest, + SparseSelectionTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace selection diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu index b9a8b849eb..e154d19d34 100644 --- a/cpp/test/sparse/sort.cu +++ b/cpp/test/sparse/sort.cu @@ -47,27 +47,27 @@ class SparseSortTest : public ::testing::TestWithParam> { const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseSortTest COOSort; -TEST_P(COOSort, Result) { +TEST_P(COOSort, Result) +{ int *in_rows, *in_cols, *verify; - float *in_vals; + float* in_vals; params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); + std::shared_ptr alloc(new raft::mr::device::default_allocator); raft::allocate(in_vals, params.nnz); r.uniform(in_vals, params.nnz, float(-1.0), float(1.0), stream); - int *in_rows_h = (int *)malloc(params.nnz * sizeof(int)); - int *in_cols_h = (int *)malloc(params.nnz * sizeof(int)); - int *verify_h = (int *)malloc(params.nnz * sizeof(int)); + int* in_rows_h = (int*)malloc(params.nnz * sizeof(int)); + int* in_cols_h = (int*)malloc(params.nnz * sizeof(int)); + int* verify_h = (int*)malloc(params.nnz * sizeof(int)); for (int i = 0; i < params.nnz; i++) { in_rows_h[i] = params.nnz - i - 1; - verify_h[i] = i; + verify_h[i] = i; in_cols_h[i] = i; } @@ -80,11 +80,9 @@ TEST_P(COOSort, Result) { raft::update_device(in_cols, in_cols_h, params.nnz, stream); raft::update_device(verify, verify_h, params.nnz, stream); - op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, alloc, - stream); + op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, alloc, stream); - ASSERT_TRUE( - raft::devArrMatch(verify, in_rows, params.nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(verify, in_rows, params.nnz, raft::Compare())); delete[] in_rows_h; delete[] in_cols_h; diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu index d104028d2b..6a66daa769 100644 --- a/cpp/test/sparse/symmetrize.cu +++ b/cpp/test/sparse/symmetrize.cu @@ -29,8 +29,9 @@ namespace raft { namespace sparse { template -__global__ void assert_symmetry(value_idx *rows, value_idx *cols, value_t *vals, - value_idx nnz, value_idx *sum) { +__global__ void assert_symmetry( + value_idx* rows, value_idx* cols, value_t* vals, value_idx nnz, value_idx* sum) +{ int tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; @@ -49,19 +50,21 @@ struct SparseSymmetrizeInputs { }; template -::std::ostream &operator<<( - ::std::ostream &os, const SparseSymmetrizeInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, + const SparseSymmetrizeInputs& dims) +{ return os; } template -class SparseSymmetrizeTest : public ::testing::TestWithParam< - SparseSymmetrizeInputs> { +class SparseSymmetrizeTest + : public ::testing::TestWithParam> { protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); @@ -72,19 +75,19 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< update_device(data, data_h.data(), data_h.size(), stream); } - void SetUp() override { - params = ::testing::TestWithParam< - SparseSymmetrizeInputs>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); raft::handle_t handle; auto alloc = handle.get_device_allocator(); - stream = handle.get_stream(); + stream = handle.get_stream(); make_data(); - value_idx m = params.indptr_h.size() - 1; - value_idx n = params.n_cols; + value_idx m = params.indptr_h.size() - 1; + value_idx n = params.n_cols; value_idx nnz = params.indices_h.size(); raft::mr::device::buffer coo_rows(alloc, stream, nnz); @@ -93,8 +96,8 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< raft::sparse::COO out(alloc, stream); - raft::sparse::linalg::symmetrize(handle, coo_rows.data(), indices, data, m, - n, coo_rows.size(), out); + raft::sparse::linalg::symmetrize( + handle, coo_rows.data(), indices, data, m, n, coo_rows.size(), out); raft::mr::device::buffer sum(alloc, stream, 1); @@ -107,7 +110,8 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); @@ -119,7 +123,7 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< // input data value_idx *indptr, *indices; - value_t *data; + value_t* data; value_idx sum_h; @@ -133,8 +137,7 @@ struct COOSymmetrizeInputs { }; template -class COOSymmetrizeTest - : public ::testing::TestWithParam> { +class COOSymmetrizeTest : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -144,7 +147,8 @@ class COOSymmetrizeTest const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef COOSymmetrizeTest COOSymmetrize; -TEST_P(COOSymmetrize, Result) { +TEST_P(COOSymmetrize, Result) +{ cudaStream_t stream; cudaStreamCreate(&stream); @@ -153,16 +157,14 @@ TEST_P(COOSymmetrize, Result) { int nnz = 8; - int *in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; - int *in_cols_h = new int[nnz]{1, 3, 2, 3, 0, 1, 0, 2}; - float *in_vals_h = new float[nnz]{0.5, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5}; + int* in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; + int* in_cols_h = new int[nnz]{1, 3, 2, 3, 0, 1, 0, 2}; + float* in_vals_h = new float[nnz]{0.5, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5}; - int *exp_rows_h = - new int[nnz * 2]{1, 0, 0, 0, 1, 3, 1, 0, 0, 2, 2, 0, 3, 2, 3, 0}; - int *exp_cols_h = - new int[nnz * 2]{0, 1, 3, 0, 2, 1, 3, 0, 2, 0, 1, 0, 0, 3, 2, 0}; - float *exp_vals_h = new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0, - 0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0}; + int* exp_rows_h = new int[nnz * 2]{1, 0, 0, 0, 1, 3, 1, 0, 0, 2, 2, 0, 3, 2, 3, 0}; + int* exp_cols_h = new int[nnz * 2]{0, 1, 3, 0, 2, 1, 3, 0, 2, 0, 1, 0, 0, 3, 2, 0}; + float* exp_vals_h = + new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0, 0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0}; COO in(alloc, stream, nnz, 4, 4); raft::update_device(in.rows(), *&in_rows_h, nnz, stream); @@ -172,22 +174,19 @@ TEST_P(COOSymmetrize, Result) { COO out(alloc, stream); linalg::coo_symmetrize<32, float>( - &in, &out, - [] __device__(int row, int col, float val, float trans) { - return val + trans; - }, - alloc, stream); + &in, + &out, + [] __device__(int row, int col, float val, float trans) { return val + trans; }, + alloc, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); std::cout << out << std::endl; ASSERT_TRUE(out.nnz == nnz * 2); - ASSERT_TRUE(raft::devArrMatch(out.rows(), exp_rows_h, out.nnz, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out.cols(), exp_cols_h, out.nnz, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out.vals(), exp_vals_h, out.nnz, - raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.rows(), exp_rows_h, out.nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.cols(), exp_cols_h, out.nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.vals(), exp_vals_h, out.nnz, raft::Compare())); cudaStreamDestroy(stream); @@ -200,8 +199,7 @@ TEST_P(COOSymmetrize, Result) { delete[] exp_vals_h; } -INSTANTIATE_TEST_CASE_P(COOSymmetrizeTest, COOSymmetrize, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(COOSymmetrizeTest, COOSymmetrize, ::testing::ValuesIn(inputsf)); const std::vector> symm_inputs_fint = { // Test n_clusters == n_points @@ -221,7 +219,8 @@ const std::vector> symm_inputs_fint = { typedef SparseSymmetrizeTest SparseSymmetrizeTestF_int; TEST_P(SparseSymmetrizeTestF_int, Result) { ASSERT_TRUE(sum_h == 0); } -INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest, SparseSymmetrizeTestF_int, +INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest, + SparseSymmetrizeTestF_int, ::testing::ValuesIn(symm_inputs_fint)); } // namespace sparse diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu index def1f1685b..8d35960d6a 100644 --- a/cpp/test/spatial/haversine.cu +++ b/cpp/test/spatial/haversine.cu @@ -29,7 +29,8 @@ namespace knn { template class HaversineKNNTest : public ::testing::Test { protected: - void basicTest() { + void basicTest() + { auto alloc = std::make_shared(); // Allocate input @@ -44,31 +45,37 @@ class HaversineKNNTest : public ::testing::Test { raft::allocate(d_pred_D, n * n); // make testdata on host - std::vector h_train_inputs = { - 0.71113885, -1.29215058, 0.59613176, -2.08048115, - 0.74932804, -1.33634042, 0.51486728, -1.65962873, - 0.53154002, -1.47049808, 0.72891737, -1.54095137}; + std::vector h_train_inputs = {0.71113885, + -1.29215058, + 0.59613176, + -2.08048115, + 0.74932804, + -1.33634042, + 0.51486728, + -1.65962873, + 0.53154002, + -1.47049808, + 0.72891737, + -1.54095137}; h_train_inputs.resize(n); raft::update_device(d_train_inputs, h_train_inputs.data(), n * d, 0); - std::vector h_res_D = { - 0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595, - 0., 0.36575755, 0.44288665, 0.5170737, 0.59501296, 0.62925595, - 0., 0.05041587, 0.152463, 0.2426416, 0.34925285, 0.59501296, - 0., 0.16461092, 0.2345792, 0.34925285, 0.35749438, 0.36575755, - 0., 0.16461092, 0.20535265, 0.23048252, 0.2426416, 0.5170737, - 0., 0.152463, 0.18767063, 0.20535265, 0.2345792, 0.44288665}; + std::vector h_res_D = {0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595, + 0., 0.36575755, 0.44288665, 0.5170737, 0.59501296, 0.62925595, + 0., 0.05041587, 0.152463, 0.2426416, 0.34925285, 0.59501296, + 0., 0.16461092, 0.2345792, 0.34925285, 0.35749438, 0.36575755, + 0., 0.16461092, 0.20535265, 0.23048252, 0.2426416, 0.5170737, + 0., 0.152463, 0.18767063, 0.20535265, 0.2345792, 0.44288665}; h_res_D.resize(n * n); raft::update_device(d_ref_D, h_res_D.data(), n * n, 0); - std::vector h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0, - 2, 0, 5, 4, 3, 1, 3, 4, 5, 2, 0, 1, - 4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1}; + std::vector h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0, 2, 0, 5, 4, 3, 1, + 3, 4, 5, 2, 0, 1, 4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1}; h_res_I.resize(n * n); raft::update_device(d_ref_I, h_res_I.data(), n * n, 0); - std::vector input_vec = {d_train_inputs}; + std::vector input_vec = {d_train_inputs}; std::vector sizes_vec = {n}; cudaStream_t stream; @@ -82,7 +89,8 @@ class HaversineKNNTest : public ::testing::Test { void SetUp() override { basicTest(); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(d_train_inputs)); CUDA_CHECK(cudaFree(d_pred_I)); CUDA_CHECK(cudaFree(d_pred_D)); @@ -91,27 +99,26 @@ class HaversineKNNTest : public ::testing::Test { } protected: - value_t *d_train_inputs; + value_t* d_train_inputs; int n = 6; int d = 2; int k = 6; - value_idx *d_pred_I; - value_t *d_pred_D; + value_idx* d_pred_I; + value_t* d_pred_D; - value_idx *d_ref_I; - value_t *d_ref_D; + value_idx* d_ref_I; + value_t* d_ref_D; }; typedef HaversineKNNTest HaversineKNNTestF; -TEST_F(HaversineKNNTestF, Fit) { - ASSERT_TRUE(raft::devArrMatch(d_ref_D, d_pred_D, n * n, - raft::CompareApprox(1e-3))); - ASSERT_TRUE( - raft::devArrMatch(d_ref_I, d_pred_I, n * n, raft::Compare())); +TEST_F(HaversineKNNTestF, Fit) +{ + ASSERT_TRUE(raft::devArrMatch(d_ref_D, d_pred_D, n * n, raft::CompareApprox(1e-3))); + ASSERT_TRUE(raft::devArrMatch(d_ref_I, d_pred_I, n * n, raft::Compare())); } } // namespace knn diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 2b1ef89f7a..d4e35c9d54 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -31,18 +31,18 @@ struct KNNInputs { std::vector labels; }; -__global__ void build_actual_output(int *output, int n_rows, int k, - const int *idx_labels, - const int64_t *indices) { +__global__ void build_actual_output( + int* output, int n_rows, int k, const int* idx_labels, const int64_t* indices) +{ int element = threadIdx.x + blockDim.x * blockIdx.x; if (element >= n_rows * k) return; - int ind = (int)indices[element]; + int ind = (int)indices[element]; output[element] = idx_labels[ind]; } -__global__ void build_expected_output(int *output, int n_rows, int k, - const int *labels) { +__global__ void build_expected_output(int* output, int n_rows, int k, const int* labels) +{ int row = threadIdx.x + blockDim.x * blockIdx.x; if (row >= n_rows) return; @@ -55,25 +55,33 @@ __global__ void build_expected_output(int *output, int n_rows, int k, template class KNNTest : public ::testing::TestWithParam { protected: - void testBruteForce() { - raft::print_device_vector("Input array: ", input_, rows_ * cols_, - std::cout); + void testBruteForce() + { + raft::print_device_vector("Input array: ", input_, rows_ * cols_, std::cout); std::cout << "K: " << k_ << "\n"; - raft::print_device_vector("Labels array: ", search_labels_, rows_, - std::cout); + raft::print_device_vector("Labels array: ", search_labels_, rows_, std::cout); auto stream = handle_.get_stream(); raft::allocate(actual_labels_, rows_ * k_, true); raft::allocate(expected_labels_, rows_ * k_, true); - std::vector input_vec; + std::vector input_vec; std::vector sizes_vec; input_vec.push_back(input_); sizes_vec.push_back(rows_); - brute_force_knn(handle_, input_vec, sizes_vec, cols_, search_data_, rows_, - indices_, distances_, k_, true, true); + brute_force_knn(handle_, + input_vec, + sizes_vec, + cols_, + search_data_, + rows_, + indices_, + distances_, + k_, + true, + true); build_actual_output<<>>( actual_labels_, rows_, k_, search_labels_, indices_); @@ -81,24 +89,20 @@ class KNNTest : public ::testing::TestWithParam { build_expected_output<<>>( expected_labels_, rows_, k_, search_labels_); - raft::print_device_vector("Output indices: ", indices_, rows_ * k_, - std::cout); - raft::print_device_vector("Output distances: ", distances_, rows_ * k_, - std::cout); - raft::print_device_vector("Output labels: ", actual_labels_, rows_ * k_, - std::cout); - raft::print_device_vector("Expected labels: ", expected_labels_, rows_ * k_, - std::cout); - - ASSERT_TRUE(devArrMatch(expected_labels_, actual_labels_, rows_ * k_, - raft::Compare())); + raft::print_device_vector("Output indices: ", indices_, rows_ * k_, std::cout); + raft::print_device_vector("Output distances: ", distances_, rows_ * k_, std::cout); + raft::print_device_vector("Output labels: ", actual_labels_, rows_ * k_, std::cout); + raft::print_device_vector("Expected labels: ", expected_labels_, rows_ * k_, std::cout); + + ASSERT_TRUE(devArrMatch(expected_labels_, actual_labels_, rows_ * k_, raft::Compare())); } - void SetUp() override { + void SetUp() override + { params_ = ::testing::TestWithParam::GetParam(); - rows_ = params_.input.size(); - cols_ = params_.input[0].size(); - k_ = params_.k; + rows_ = params_.input.size(); + cols_ = params_.input[0].size(); + k_ = params_.k; std::vector row_major_input; for (int i = 0; i < params_.input.size(); ++i) { @@ -107,14 +111,12 @@ class KNNTest : public ::testing::TestWithParam { } } rmm::device_buffer input_d = rmm::device_buffer( - row_major_input.data(), row_major_input.size() * sizeof(float), - handle_.get_stream()); - float *input_ptr = static_cast(input_d.data()); + row_major_input.data(), row_major_input.size() * sizeof(float), handle_.get_stream()); + float* input_ptr = static_cast(input_d.data()); rmm::device_buffer labels_d = rmm::device_buffer( - params_.labels.data(), params_.labels.size() * sizeof(int), - handle_.get_stream()); - int *labels_ptr = static_cast(labels_d.data()); + params_.labels.data(), params_.labels.size() * sizeof(int), handle_.get_stream()); + int* labels_ptr = static_cast(labels_d.data()); raft::allocate(input_, rows_ * cols_, true); raft::allocate(search_data_, rows_ * cols_, true); @@ -127,7 +129,8 @@ class KNNTest : public ::testing::TestWithParam { raft::copy(search_labels_, labels_ptr, rows_, handle_.get_stream()); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(search_data_)); CUDA_CHECK(cudaFree(indices_)); CUDA_CHECK(cudaFree(distances_)); @@ -139,15 +142,15 @@ class KNNTest : public ::testing::TestWithParam { KNNInputs params_; int rows_; int cols_; - float *input_; - float *search_data_; - int64_t *indices_; - float *distances_; + float* input_; + float* search_data_; + int64_t* indices_; + float* distances_; int k_; - int *search_labels_; - int *actual_labels_; - int *expected_labels_; + int* search_labels_; + int* actual_labels_; + int* expected_labels_; }; const std::vector inputs = { diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu index e5c2d52764..2d7d713717 100644 --- a/cpp/test/spectral_matrix.cu +++ b/cpp/test/spectral_matrix.cu @@ -32,7 +32,8 @@ struct csr_view_t { index_type number_of_edges; }; } // namespace -TEST(Raft, SpectralMatrices) { +TEST(Raft, SpectralMatrices) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -49,19 +50,18 @@ TEST(Raft, SpectralMatrices) { index_type* ro{nullptr}; index_type* ci{nullptr}; value_type* vs{nullptr}; - index_type nnz = 0; + index_type nnz = 0; index_type nrows = 0; sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; sparse_matrix_t sm2{h, csr_v}; ASSERT_EQ(nullptr, sm1.row_offsets_); ASSERT_EQ(nullptr, sm2.row_offsets_); - auto stream = h.get_stream(); + auto stream = h.get_stream(); auto t_exe_pol = thrust::cuda::par.on(stream); auto cnstr_lm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { - laplacian_matrix_t lm1{h, t_exe_pol, ro, ci, - vs, nrows, nnz}; + laplacian_matrix_t lm1{h, t_exe_pol, ro, ci, vs, nrows, nnz}; }; EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr args @@ -71,8 +71,7 @@ TEST(Raft, SpectralMatrices) { EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args auto cnstr_mm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { - modularity_matrix_t mm1{h, t_exe_pol, ro, ci, - vs, nrows, nnz}; + modularity_matrix_t mm1{h, t_exe_pol, ro, ci, vs, nrows, nnz}; }; EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu index 4a3b0ed196..8eb2f91952 100644 --- a/cpp/test/stats/mean.cu +++ b/cpp/test/stats/mean.cu @@ -35,14 +35,16 @@ struct MeanInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const MeanInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MeanInputs& dims) +{ return os; } template class MeanTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); @@ -59,13 +61,15 @@ class MeanTest : public ::testing::TestWithParam> { meanSGtest(data, stream); } - void meanSGtest(T *data, cudaStream_t stream) { + void meanSGtest(T* data, cudaStream_t stream) + { int rows = params.rows, cols = params.cols; mean(mean_act, data, cols, rows, params.sample, params.rowMajor, stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(mean_act)); } @@ -78,52 +82,52 @@ class MeanTest : public ::testing::TestWithParam> { // Note: For 1024 samples, 256 experiments, a mean of 1.0 with stddev=1.0, the // measured mean (of a normal distribution) will fall outside of an epsilon of // 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times) -const std::vector> inputsf = { - {0.15f, 1.f, 1024, 32, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, - {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, - {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, - {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 256, false, true, 1234ULL}}; - -const std::vector> inputsd = { - {0.15, 1.0, 1024, 32, true, false, 1234ULL}, - {0.15, 1.0, 1024, 64, true, false, 1234ULL}, - {0.15, 1.0, 1024, 128, true, false, 1234ULL}, - {0.15, 1.0, 1024, 256, true, false, 1234ULL}, - {0.15, -1.0, 1024, 32, false, false, 1234ULL}, - {0.15, -1.0, 1024, 64, false, false, 1234ULL}, - {0.15, -1.0, 1024, 128, false, false, 1234ULL}, - {0.15, -1.0, 1024, 256, false, false, 1234ULL}, - {0.15, 1.0, 1024, 32, true, true, 1234ULL}, - {0.15, 1.0, 1024, 64, true, true, 1234ULL}, - {0.15, 1.0, 1024, 128, true, true, 1234ULL}, - {0.15, 1.0, 1024, 256, true, true, 1234ULL}, - {0.15, -1.0, 1024, 32, false, true, 1234ULL}, - {0.15, -1.0, 1024, 64, false, true, 1234ULL}, - {0.15, -1.0, 1024, 128, false, true, 1234ULL}, - {0.15, -1.0, 1024, 256, false, true, 1234ULL}}; +const std::vector> inputsf = {{0.15f, 1.f, 1024, 32, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, + {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, true, 1234ULL}}; + +const std::vector> inputsd = {{0.15, 1.0, 1024, 32, true, false, 1234ULL}, + {0.15, 1.0, 1024, 64, true, false, 1234ULL}, + {0.15, 1.0, 1024, 128, true, false, 1234ULL}, + {0.15, 1.0, 1024, 256, true, false, 1234ULL}, + {0.15, -1.0, 1024, 32, false, false, 1234ULL}, + {0.15, -1.0, 1024, 64, false, false, 1234ULL}, + {0.15, -1.0, 1024, 128, false, false, 1234ULL}, + {0.15, -1.0, 1024, 256, false, false, 1234ULL}, + {0.15, 1.0, 1024, 32, true, true, 1234ULL}, + {0.15, 1.0, 1024, 64, true, true, 1234ULL}, + {0.15, 1.0, 1024, 128, true, true, 1234ULL}, + {0.15, 1.0, 1024, 256, true, true, 1234ULL}, + {0.15, -1.0, 1024, 32, false, true, 1234ULL}, + {0.15, -1.0, 1024, 64, false, true, 1234ULL}, + {0.15, -1.0, 1024, 128, false, true, 1234ULL}, + {0.15, -1.0, 1024, 256, false, true, 1234ULL}}; typedef MeanTest MeanTestF; -TEST_P(MeanTestF, Result) { - ASSERT_TRUE(devArrMatch(params.mean, mean_act, params.cols, - CompareApprox(params.tolerance))); +TEST_P(MeanTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(params.mean, mean_act, params.cols, CompareApprox(params.tolerance))); } typedef MeanTest MeanTestD; -TEST_P(MeanTestD, Result) { - ASSERT_TRUE(devArrMatch(params.mean, mean_act, params.cols, - CompareApprox(params.tolerance))); +TEST_P(MeanTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(params.mean, mean_act, params.cols, CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(MeanTests, MeanTestF, ::testing::ValuesIn(inputsf)); diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu index 8b0d607561..67df0def05 100644 --- a/cpp/test/stats/mean_center.cu +++ b/cpp/test/stats/mean_center.cu @@ -34,16 +34,16 @@ struct MeanCenterInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const MeanCenterInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MeanCenterInputs& dims) +{ return os; } template -class MeanCenterTest - : public ::testing::TestWithParam> { +class MeanCenterTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); @@ -51,7 +51,7 @@ class MeanCenterTest CUDA_CHECK(cudaStreamCreate(&stream)); auto rows = params.rows, cols = params.cols; - auto len = rows * cols; + auto len = rows * cols; IdxType vecLen = params.bcastAlongRows ? cols : rows; raft::allocate(out, len); @@ -59,16 +59,15 @@ class MeanCenterTest raft::allocate(data, len); raft::allocate(meanVec, vecLen); r.normal(data, len, params.mean, (T)1.0, stream); - raft::stats::mean(meanVec, data, cols, rows, params.sample, params.rowMajor, - stream); - meanCenter(out, data, meanVec, cols, rows, params.rowMajor, - params.bcastAlongRows, stream); - raft::linalg::naiveMatVec(out_ref, data, meanVec, cols, rows, - params.rowMajor, params.bcastAlongRows, (T)-1.0); + raft::stats::mean(meanVec, data, cols, rows, params.sample, params.rowMajor, stream); + meanCenter(out, data, meanVec, cols, rows, params.rowMajor, params.bcastAlongRows, stream); + raft::linalg::naiveMatVec( + out_ref, data, meanVec, cols, rows, params.rowMajor, params.bcastAlongRows, (T)-1.0); CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(out)); CUDA_CHECK(cudaFree(out_ref)); CUDA_CHECK(cudaFree(data)); @@ -106,12 +105,11 @@ const std::vector> inputsf_i32 = { {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL}, {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestF_i32; -TEST_P(MeanCenterTestF_i32, Result) { - ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestF_i32, Result) +{ + ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, - ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, ::testing::ValuesIn(inputsf_i32)); const std::vector> inputsf_i64 = { {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL}, @@ -139,12 +137,11 @@ const std::vector> inputsf_i64 = { {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL}, {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestF_i64; -TEST_P(MeanCenterTestF_i64, Result) { - ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestF_i64, Result) +{ + ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, - ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsd_i32 = { {0.05, 1.0, 1024, 32, true, false, true, 1234ULL}, @@ -172,12 +169,12 @@ const std::vector> inputsd_i32 = { {0.05, -1.0, 1024, 64, false, true, false, 1234ULL}, {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestD_i32; -TEST_P(MeanCenterTestD_i32, Result) { - ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestD_i32, Result) +{ + ASSERT_TRUE( + devArrMatch(out, out_ref, params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, - ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.05, 1.0, 1024, 32, true, false, true, 1234ULL}, @@ -205,12 +202,12 @@ const std::vector> inputsd_i64 = { {0.05, -1.0, 1024, 64, false, true, false, 1234ULL}, {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestD_i64; -TEST_P(MeanCenterTestD_i64, Result) { - ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestD_i64, Result) +{ + ASSERT_TRUE( + devArrMatch(out, out_ref, params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64, - ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64, ::testing::ValuesIn(inputsd_i64)); } // end namespace stats } // end namespace raft diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index ff2698788f..8b7f75171b 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -34,14 +34,16 @@ struct StdDevInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const StdDevInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const StdDevInputs& dims) +{ return os; } template class StdDevTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); random::Rng r(params.seed); int rows = params.rows, cols = params.cols; @@ -58,21 +60,21 @@ class StdDevTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void stdVarSGtest(T *data, cudaStream_t stream) { + void stdVarSGtest(T* data, cudaStream_t stream) + { int rows = params.rows, cols = params.cols; mean(mean_act, data, cols, rows, params.sample, params.rowMajor, stream); - stddev(stddev_act, data, mean_act, cols, rows, params.sample, - params.rowMajor, stream); + stddev(stddev_act, data, mean_act, cols, rows, params.sample, params.rowMajor, stream); - vars(vars_act, data, mean_act, cols, rows, params.sample, params.rowMajor, - stream); + vars(vars_act, data, mean_act, cols, rows, params.sample, params.rowMajor, stream); raft::matrix::seqRoot(vars_act, T(1), cols, stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(mean_act)); CUDA_CHECK(cudaFree(stddev_act)); @@ -121,28 +123,28 @@ const std::vector> inputsd = { {0.1, -1.0, 2.0, 1024, 256, false, true, 1234ULL}}; typedef StdDevTest StdDevTestF; -TEST_P(StdDevTestF, Result) { - ASSERT_TRUE(devArrMatch(params.stddev, stddev_act, params.cols, - CompareApprox(params.tolerance))); +TEST_P(StdDevTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(params.stddev, stddev_act, params.cols, CompareApprox(params.tolerance))); - ASSERT_TRUE(devArrMatch(stddev_act, vars_act, params.cols, - CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(stddev_act, vars_act, params.cols, CompareApprox(params.tolerance))); } typedef StdDevTest StdDevTestD; -TEST_P(StdDevTestD, Result) { - ASSERT_TRUE(devArrMatch(params.stddev, stddev_act, params.cols, - CompareApprox(params.tolerance))); +TEST_P(StdDevTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(params.stddev, stddev_act, params.cols, CompareApprox(params.tolerance))); - ASSERT_TRUE(devArrMatch(stddev_act, vars_act, params.cols, - CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(stddev_act, vars_act, params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestD, ::testing::ValuesIn(inputsd)); } // end namespace stats } // end namespace raft diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index c3140d4588..89e81708cc 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -32,15 +32,17 @@ struct SumInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const SumInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SumInputs& dims) +{ return os; } template class SumTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = ::testing::TestWithParam>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); int rows = params.rows, cols = params.cols; int len = rows * cols; cudaStream_t stream; @@ -59,7 +61,8 @@ class SumTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(sum_act)); } @@ -76,15 +79,17 @@ const std::vector> inputsd = {{0.05, 1024, 32, 1234ULL}, {0.05, 1024, 256, 1234ULL}}; typedef SumTest SumTestF; -TEST_P(SumTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(float(params.rows), sum_act, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(SumTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + float(params.rows), sum_act, params.cols, raft::CompareApprox(params.tolerance))); } typedef SumTest SumTestD; -TEST_P(SumTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(double(params.rows), sum_act, params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(SumTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + double(params.rows), sum_act, params.cols, raft::CompareApprox(params.tolerance))); } INSTANTIATE_TEST_CASE_P(SumTests, SumTestF, ::testing::ValuesIn(inputsf)); diff --git a/cpp/test/test_utils.h b/cpp/test/test_utils.h index b8e8fe3fa0..ca09d9c855 100644 --- a/cpp/test/test_utils.h +++ b/cpp/test/test_utils.h @@ -25,15 +25,16 @@ namespace raft { template struct Compare { - bool operator()(const T &a, const T &b) const { return a == b; } + bool operator()(const T& a, const T& b) const { return a == b; } }; template struct CompareApprox { CompareApprox(T eps_) : eps(eps_) {} - bool operator()(const T &a, const T &b) const { - T diff = abs(a - b); - T m = std::max(abs(a), abs(b)); + bool operator()(const T& a, const T& b) const + { + T diff = abs(a - b); + T m = std::max(abs(a), abs(b)); T ratio = diff >= eps ? diff / m : diff; return (ratio <= eps); @@ -46,9 +47,10 @@ struct CompareApprox { template struct CompareApproxAbs { CompareApproxAbs(T eps_) : eps(eps_) {} - bool operator()(const T &a, const T &b) const { - T diff = abs(abs(a) - abs(b)); - T m = std::max(abs(a), abs(b)); + bool operator()(const T& a, const T& b) const + { + T diff = abs(abs(a) - abs(b)); + T m = std::max(abs(a), abs(b)); T ratio = diff >= eps ? diff / m : diff; return (ratio <= eps); } @@ -58,25 +60,26 @@ struct CompareApproxAbs { }; template -T abs(const T &a) { +T abs(const T& a) +{ return a > T(0) ? a : -a; } /* - * @brief Helper function to compare 2 device n-D arrays with custom comparison - * @tparam T the data type of the arrays - * @tparam L the comparator lambda or object function - * @param expected expected value(s) - * @param actual actual values - * @param eq_compare the comparator - * @param stream cuda stream - * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE - * @{ - */ + * @brief Helper function to compare 2 device n-D arrays with custom comparison + * @tparam T the data type of the arrays + * @tparam L the comparator lambda or object function + * @param expected expected value(s) + * @param actual actual values + * @param eq_compare the comparator + * @param stream cuda stream + * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE + * @{ + */ template -testing::AssertionResult devArrMatch(const T *expected, const T *actual, - size_t size, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult devArrMatch( + const T* expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0) +{ std::unique_ptr exp_h(new T[size]); std::unique_ptr act_h(new T[size]); raft::update_host(exp_h.get(), expected, size, stream); @@ -86,16 +89,16 @@ testing::AssertionResult devArrMatch(const T *expected, const T *actual, auto exp = exp_h.get()[i]; auto act = act_h.get()[i]; if (!eq_compare(exp, act)) { - return testing::AssertionFailure() - << "actual=" << act << " != expected=" << exp << " @" << i; + return testing::AssertionFailure() << "actual=" << act << " != expected=" << exp << " @" << i; } } return testing::AssertionSuccess(); } template -testing::AssertionResult devArrMatch(T expected, const T *actual, size_t size, - L eq_compare, cudaStream_t stream = 0) { +testing::AssertionResult devArrMatch( + T expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0) +{ std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual, size, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -110,9 +113,13 @@ testing::AssertionResult devArrMatch(T expected, const T *actual, size_t size, } template -testing::AssertionResult devArrMatch(const T *expected, const T *actual, - size_t rows, size_t cols, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult devArrMatch(const T* expected, + const T* actual, + size_t rows, + size_t cols, + L eq_compare, + cudaStream_t stream = 0) +{ size_t size = rows * cols; std::unique_ptr exp_h(new T[size]); std::unique_ptr act_h(new T[size]); @@ -126,8 +133,7 @@ testing::AssertionResult devArrMatch(const T *expected, const T *actual, auto act = act_h.get()[idx]; if (!eq_compare(exp, act)) { return testing::AssertionFailure() - << "actual=" << act << " != expected=" << exp << " @" << i << "," - << j; + << "actual=" << act << " != expected=" << exp << " @" << i << "," << j; } } } @@ -135,9 +141,9 @@ testing::AssertionResult devArrMatch(const T *expected, const T *actual, } template -testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows, - size_t cols, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult devArrMatch( + T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0) +{ size_t size = rows * cols; std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual, size, stream); @@ -148,8 +154,7 @@ testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows, auto act = act_h.get()[idx]; if (!eq_compare(expected, act)) { return testing::AssertionFailure() - << "actual=" << act << " != expected=" << expected << " @" << i - << "," << j; + << "actual=" << act << " != expected=" << expected << " @" << i << "," << j; } } } @@ -157,24 +162,24 @@ testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows, } /* - * @brief Helper function to compare a device n-D arrays with an expected array - * on the host, using a custom comparison - * @tparam T the data type of the arrays - * @tparam L the comparator lambda or object function - * @param expected_h host array of expected value(s) - * @param actual_d device array actual values - * @param eq_compare the comparator - * @param stream cuda stream - * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE - */ + * @brief Helper function to compare a device n-D arrays with an expected array + * on the host, using a custom comparison + * @tparam T the data type of the arrays + * @tparam L the comparator lambda or object function + * @param expected_h host array of expected value(s) + * @param actual_d device array actual values + * @param eq_compare the comparator + * @param stream cuda stream + * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE + */ template -testing::AssertionResult devArrMatchHost(const T *expected_h, const T *actual_d, - size_t size, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult devArrMatchHost( + const T* expected_h, const T* actual_d, size_t size, L eq_compare, cudaStream_t stream = 0) +{ std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual_d, size, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); - bool ok = true; + bool ok = true; auto fail = testing::AssertionFailure(); for (size_t i(0); i < size; ++i) { auto exp = expected_h[i]; @@ -189,19 +194,19 @@ testing::AssertionResult devArrMatchHost(const T *expected_h, const T *actual_d, } /* - * @brief Helper function to compare diagonal values of a 2D matrix - * @tparam T the data type of the arrays - * @tparam L the comparator lambda or object function - * @param expected expected value along diagonal - * @param actual actual matrix - * @param eq_compare the comparator - * @param stream cuda stream - * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE - */ + * @brief Helper function to compare diagonal values of a 2D matrix + * @tparam T the data type of the arrays + * @tparam L the comparator lambda or object function + * @param expected expected value along diagonal + * @param actual actual matrix + * @param eq_compare the comparator + * @param stream cuda stream + * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE + */ template -testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows, - size_t cols, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult diagonalMatch( + T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0) +{ size_t size = rows * cols; std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual, size, stream); @@ -213,8 +218,7 @@ testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows, auto act = act_h.get()[idx]; if (!eq_compare(expected, act)) { return testing::AssertionFailure() - << "actual=" << act << " != expected=" << expected << " @" << i - << "," << j; + << "actual=" << act << " != expected=" << expected << " @" << i << "," << j; } } } @@ -222,10 +226,10 @@ testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows, } template -testing::AssertionResult match(const T expected, T actual, L eq_compare) { +testing::AssertionResult match(const T expected, T actual, L eq_compare) +{ if (!eq_compare(expected, actual)) { - return testing::AssertionFailure() - << "actual=" << actual << " != expected=" << expected; + return testing::AssertionFailure() << "actual=" << actual << " != expected=" << expected; } return testing::AssertionSuccess(); } From 31bf93e7b2ff1f64ed16f31717453586487856f1 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Wed, 24 Nov 2021 17:58:22 -0500 Subject: [PATCH 3/5] Revert "Formatting changes" This reverts commit cc03dbac0da3a25b51404fec2526c43812982be7. --- cpp/include/raft.hpp | 3 +- cpp/include/raft/cache/cache_util.cuh | 104 +- cpp/include/raft/common/cub_wrappers.cuh | 42 +- .../raft/common/device_loads_stores.cuh | 87 +- cpp/include/raft/common/scatter.cuh | 77 +- cpp/include/raft/comms/comms.hpp | 342 ++-- cpp/include/raft/comms/helper.hpp | 37 +- cpp/include/raft/comms/mpi_comms.hpp | 300 ++-- cpp/include/raft/comms/std_comms.hpp | 328 ++-- cpp/include/raft/comms/test.hpp | 236 +-- cpp/include/raft/comms/ucp_helper.hpp | 138 +- cpp/include/raft/comms/util.hpp | 114 +- cpp/include/raft/cuda_utils.cuh | 259 +-- cpp/include/raft/cudart_utils.h | 190 +- cpp/include/raft/device_atomics.cuh | 265 +-- cpp/include/raft/distance/canberra.cuh | 136 +- cpp/include/raft/distance/chebyshev.cuh | 136 +- cpp/include/raft/distance/cosine.cuh | 175 +- cpp/include/raft/distance/distance.cuh | 520 ++---- cpp/include/raft/distance/euclidean.cuh | 314 ++-- cpp/include/raft/distance/fused_l2_nn.cuh | 254 +-- cpp/include/raft/distance/hellinger.cuh | 154 +- cpp/include/raft/distance/l1.cuh | 128 +- cpp/include/raft/distance/minkowski.cuh | 139 +- .../raft/distance/pairwise_distance_base.cuh | 159 +- cpp/include/raft/error.hpp | 50 +- cpp/include/raft/handle.hpp | 121 +- cpp/include/raft/integer_utils.h | 55 +- cpp/include/raft/label/classlabels.cuh | 137 +- cpp/include/raft/label/merge_labels.cuh | 31 +- cpp/include/raft/lap/d_structs.h | 20 +- cpp/include/raft/lap/lap.cuh | 161 +- cpp/include/raft/lap/lap_functions.cuh | 399 ++--- cpp/include/raft/lap/lap_kernels.cuh | 343 ++-- cpp/include/raft/linalg/add.cuh | 35 +- cpp/include/raft/linalg/binary_op.cuh | 61 +- .../raft/linalg/cholesky_r1_update.cuh | 63 +- .../raft/linalg/coalesced_reduction.cuh | 55 +- cpp/include/raft/linalg/contractions.cuh | 76 +- cpp/include/raft/linalg/cublas_wrappers.h | 921 +++------- cpp/include/raft/linalg/cusolver_wrappers.h | 1144 +++--------- cpp/include/raft/linalg/divide.cuh | 7 +- cpp/include/raft/linalg/eig.cuh | 169 +- cpp/include/raft/linalg/eltwise.cuh | 56 +- cpp/include/raft/linalg/gemm.cuh | 85 +- cpp/include/raft/linalg/gemv.h | 54 +- cpp/include/raft/linalg/init.h | 6 +- cpp/include/raft/linalg/lanczos.hpp | 786 +++----- cpp/include/raft/linalg/map.cuh | 31 +- cpp/include/raft/linalg/map_then_reduce.cuh | 92 +- cpp/include/raft/linalg/matrix_vector_op.cuh | 102 +- .../raft/linalg/mean_squared_error.cuh | 10 +- cpp/include/raft/linalg/multiply.cuh | 7 +- cpp/include/raft/linalg/norm.cuh | 92 +- cpp/include/raft/linalg/qr.cuh | 87 +- cpp/include/raft/linalg/reduce.cuh | 37 +- cpp/include/raft/linalg/strided_reduction.cuh | 74 +- cpp/include/raft/linalg/subtract.cuh | 34 +- cpp/include/raft/linalg/svd.cuh | 238 +-- cpp/include/raft/linalg/transpose.h | 61 +- cpp/include/raft/linalg/unary_op.cuh | 86 +- cpp/include/raft/matrix/math.cuh | 286 +-- cpp/include/raft/matrix/matrix.cuh | 208 +-- cpp/include/raft/mr/buffer_base.hpp | 59 +- cpp/include/raft/mr/device/allocator.hpp | 9 +- cpp/include/raft/mr/device/buffer.hpp | 14 +- cpp/include/raft/mr/host/allocator.hpp | 13 +- cpp/include/raft/mr/host/buffer.hpp | 21 +- cpp/include/raft/random/rng.cuh | 319 ++-- cpp/include/raft/random/rng_impl.cuh | 89 +- cpp/include/raft/sparse/convert/coo.cuh | 20 +- cpp/include/raft/sparse/convert/csr.cuh | 126 +- cpp/include/raft/sparse/convert/dense.cuh | 35 +- cpp/include/raft/sparse/coo.cuh | 192 +- cpp/include/raft/sparse/csr.cuh | 131 +- cpp/include/raft/sparse/cusparse_wrappers.h | 1590 +++++------------ .../raft/sparse/distance/bin_distance.cuh | 189 +- cpp/include/raft/sparse/distance/common.h | 18 +- cpp/include/raft/sparse/distance/coo_spmv.cuh | 118 +- .../coo_spmv_strategies/base_strategy.cuh | 138 +- .../coo_mask_row_iterators.cuh | 166 +- .../dense_smem_strategy.cuh | 104 +- .../coo_spmv_strategies/hash_strategy.cuh | 277 ++- .../distance/detail/coo_spmv_kernel.cuh | 196 +- cpp/include/raft/sparse/distance/distance.cuh | 48 +- .../raft/sparse/distance/ip_distance.cuh | 27 +- .../raft/sparse/distance/l2_distance.cuh | 386 ++-- .../raft/sparse/distance/lp_distance.cuh | 199 +-- .../raft/sparse/distance/operators.cuh | 29 +- cpp/include/raft/sparse/distance/utils.cuh | 6 +- cpp/include/raft/sparse/hierarchy/common.h | 10 +- .../sparse/hierarchy/detail/agglomerative.cuh | 124 +- .../hierarchy/detail/connectivities.cuh | 92 +- .../raft/sparse/hierarchy/detail/mst.cuh | 93 +- .../raft/sparse/hierarchy/single_linkage.hpp | 66 +- cpp/include/raft/sparse/linalg/add.cuh | 116 +- cpp/include/raft/sparse/linalg/degree.cuh | 56 +- cpp/include/raft/sparse/linalg/norm.cuh | 51 +- cpp/include/raft/sparse/linalg/spectral.cuh | 72 +- cpp/include/raft/sparse/linalg/symmetrize.cuh | 157 +- cpp/include/raft/sparse/linalg/transpose.h | 56 +- .../raft/sparse/mst/detail/mst_kernels.cuh | 160 +- .../raft/sparse/mst/detail/mst_solver_inl.cuh | 258 ++- cpp/include/raft/sparse/mst/detail/utils.cuh | 19 +- cpp/include/raft/sparse/mst/mst.cuh | 34 +- cpp/include/raft/sparse/mst/mst_solver.cuh | 48 +- cpp/include/raft/sparse/op/filter.cuh | 115 +- cpp/include/raft/sparse/op/reduce.cuh | 55 +- cpp/include/raft/sparse/op/row_op.cuh | 16 +- cpp/include/raft/sparse/op/slice.h | 34 +- cpp/include/raft/sparse/op/sort.h | 35 +- .../sparse/selection/connect_components.cuh | 224 +-- cpp/include/raft/sparse/selection/knn.cuh | 444 ++--- .../raft/sparse/selection/knn_graph.cuh | 54 +- .../raft/sparse/selection/selection.cuh | 99 +- cpp/include/raft/sparse/utils.h | 22 +- cpp/include/raft/spatial/knn/ann.hpp | 31 +- cpp/include/raft/spatial/knn/ann_common.h | 10 +- .../knn/detail/ann_quantized_faiss.cuh | 141 +- .../raft/spatial/knn/detail/common_faiss.h | 37 +- .../spatial/knn/detail/haversine_distance.cuh | 56 +- .../knn/detail/knn_brute_force_faiss.cuh | 178 +- .../raft/spatial/knn/detail/processing.hpp | 134 +- cpp/include/raft/spatial/knn/knn.hpp | 64 +- cpp/include/raft/spectral/cluster_solvers.hpp | 39 +- cpp/include/raft/spectral/eigen_solvers.hpp | 66 +- cpp/include/raft/spectral/kmeans.hpp | 476 ++--- cpp/include/raft/spectral/lapack.hpp | 552 ++---- cpp/include/raft/spectral/matrix_wrappers.hpp | 279 ++- .../raft/spectral/modularity_maximization.hpp | 52 +- cpp/include/raft/spectral/partition.hpp | 61 +- cpp/include/raft/spectral/spectral_util.hpp | 125 +- cpp/include/raft/spectral/warn_dbg.hpp | 4 +- cpp/include/raft/stats/mean.cuh | 42 +- cpp/include/raft/stats/mean_center.cuh | 45 +- cpp/include/raft/stats/stddev.cuh | 102 +- cpp/include/raft/stats/sum.cuh | 38 +- cpp/include/raft/vectorized.cuh | 112 +- cpp/test/cluster_solvers.cu | 22 +- cpp/test/cudart_utils.cpp | 3 +- cpp/test/distance/dist_adj.cu | 78 +- cpp/test/distance/dist_canberra.cu | 24 +- cpp/test/distance/dist_chebyshev.cu | 24 +- cpp/test/distance/dist_cos.cu | 23 +- cpp/test/distance/dist_euc_exp.cu | 22 +- cpp/test/distance/dist_euc_unexp.cu | 18 +- cpp/test/distance/dist_hellinger.cu | 24 +- cpp/test/distance/dist_l1.cu | 24 +- cpp/test/distance/dist_minkowski.cu | 23 +- cpp/test/distance/distance_base.cuh | 203 +-- cpp/test/distance/fused_l2_nn.cu | 192 +- cpp/test/eigen_solvers.cu | 35 +- cpp/test/handle.cpp | 21 +- cpp/test/integer_utils.cpp | 6 +- cpp/test/label/label.cu | 31 +- cpp/test/label/merge_labels.cu | 67 +- cpp/test/lap/lap.cu | 92 +- cpp/test/linalg/add.cu | 13 +- cpp/test/linalg/add.cuh | 17 +- cpp/test/linalg/binary_op.cu | 88 +- cpp/test/linalg/binary_op.cuh | 17 +- cpp/test/linalg/cholesky_r1.cu | 50 +- cpp/test/linalg/coalesced_reduction.cu | 60 +- cpp/test/linalg/divide.cu | 50 +- cpp/test/linalg/eig.cu | 177 +- cpp/test/linalg/eig_sel.cu | 92 +- cpp/test/linalg/eltwise.cu | 98 +- cpp/test/linalg/gemm_layout.cu | 63 +- cpp/test/linalg/map.cu | 98 +- cpp/test/linalg/map_then_reduce.cu | 99 +- cpp/test/linalg/matrix_vector_op.cu | 109 +- cpp/test/linalg/matrix_vector_op.cuh | 73 +- cpp/test/linalg/multiply.cu | 30 +- cpp/test/linalg/norm.cu | 140 +- cpp/test/linalg/reduce.cu | 84 +- cpp/test/linalg/reduce.cuh | 59 +- cpp/test/linalg/strided_reduction.cu | 61 +- cpp/test/linalg/subtract.cu | 74 +- cpp/test/linalg/svd.cu | 108 +- cpp/test/linalg/transpose.cu | 51 +- cpp/test/linalg/unary_op.cu | 46 +- cpp/test/linalg/unary_op.cuh | 17 +- cpp/test/matrix/math.cu | 194 +- cpp/test/matrix/matrix.cu | 84 +- cpp/test/mr/device/buffer.cpp | 16 +- cpp/test/mr/host/buffer.cpp | 9 +- cpp/test/mst.cu | 172 +- cpp/test/random/rng.cu | 203 ++- cpp/test/random/rng_int.cu | 66 +- cpp/test/random/sample_without_replacement.cu | 35 +- cpp/test/sparse/add.cu | 97 +- cpp/test/sparse/connect_components.cu | 599 ++++--- cpp/test/sparse/convert_coo.cu | 20 +- cpp/test/sparse/convert_csr.cu | 50 +- cpp/test/sparse/csr_row_slice.cu | 80 +- cpp/test/sparse/csr_to_dense.cu | 63 +- cpp/test/sparse/csr_transpose.cu | 80 +- cpp/test/sparse/degree.cu | 23 +- cpp/test/sparse/dist_coo_spmv.cu | 936 +++++----- cpp/test/sparse/distance.cu | 248 +-- cpp/test/sparse/filter.cu | 33 +- cpp/test/sparse/knn.cu | 91 +- cpp/test/sparse/knn_graph.cu | 36 +- cpp/test/sparse/linkage.cu | 647 ++++--- cpp/test/sparse/norm.cu | 34 +- cpp/test/sparse/reduce.cu | 50 +- cpp/test/sparse/row_op.cu | 40 +- cpp/test/sparse/selection.cu | 59 +- cpp/test/sparse/sort.cu | 22 +- cpp/test/sparse/symmetrize.cu | 89 +- cpp/test/spatial/haversine.cu | 61 +- cpp/test/spatial/knn.cu | 89 +- cpp/test/spectral_matrix.cu | 13 +- cpp/test/stats/mean.cu | 94 +- cpp/test/stats/mean_center.cu | 63 +- cpp/test/stats/stddev.cu | 46 +- cpp/test/stats/sum.cu | 25 +- cpp/test/test_utils.h | 136 +- 218 files changed, 11470 insertions(+), 16429 deletions(-) diff --git a/cpp/include/raft.hpp b/cpp/include/raft.hpp index 08f836d3a8..f380d276b2 100644 --- a/cpp/include/raft.hpp +++ b/cpp/include/raft.hpp @@ -21,8 +21,7 @@ namespace raft { /* Function for testing RAFT include * * @return message indicating RAFT has been included succesfully*/ -inline std::string test_raft() -{ +inline std::string test_raft() { std::string status = "RAFT Setup succesfully"; return status; } diff --git a/cpp/include/raft/cache/cache_util.cuh b/cpp/include/raft/cache/cache_util.cuh index f63040fa00..ce8ef9a095 100644 --- a/cpp/include/raft/cache/cache_util.cuh +++ b/cpp/include/raft/cache/cache_util.cuh @@ -42,15 +42,17 @@ namespace cache { * @param [out] out vectors collected from the cache, size [n_vec * n] */ template -__global__ void get_vecs(const math_t* cache, int n_vec, const int* cache_idx, int n, math_t* out) -{ +__global__ void get_vecs(const math_t *cache, int n_vec, const int *cache_idx, + int n, math_t *out) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int row = tid % n_vec; // row idx if (tid < n_vec * n) { - size_t out_col = tid / n_vec; // col idx + size_t out_col = tid / n_vec; // col idx size_t cache_col = cache_idx[out_col]; if (cache_idx[out_col] >= 0) { - if (row + out_col * n_vec < (size_t)n_vec * n) { out[tid] = cache[row + cache_col * n_vec]; } + if (row + out_col * n_vec < (size_t)n_vec * n) { + out[tid] = cache[row + cache_col * n_vec]; + } } } } @@ -82,26 +84,21 @@ __global__ void get_vecs(const math_t* cache, int n_vec, const int* cache_idx, i * @param [in] n_cache_vecs */ template -__global__ void store_vecs(const math_t* tile, - int n_tile, - int n_vec, - const int* tile_idx, - int n, - const int* cache_idx, - math_t* cache, - int n_cache_vecs) -{ +__global__ void store_vecs(const math_t *tile, int n_tile, int n_vec, + const int *tile_idx, int n, const int *cache_idx, + math_t *cache, int n_cache_vecs) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int row = tid % n_vec; // row idx if (tid < n_vec * n) { - int tile_col = tid / n_vec; // col idx - int data_col = tile_idx ? tile_idx[tile_col] : tile_col; + int tile_col = tid / n_vec; // col idx + int data_col = tile_idx ? tile_idx[tile_col] : tile_col; int cache_col = cache_idx[tile_col]; // We ignore negative values. The rest of the checks should be fulfilled // if the cache is used properly if (cache_col >= 0 && cache_col < n_cache_vecs && data_col < n_tile) { - cache[row + (size_t)cache_col * n_vec] = tile[row + (size_t)data_col * n_vec]; + cache[row + (size_t)cache_col * n_vec] = + tile[row + (size_t)data_col * n_vec]; } } } @@ -124,15 +121,14 @@ int DI hash(int key, int n_cache_sets) { return key % n_cache_sets; } * @return the index of the first element in the array for which * array[idx] >= value. If there is no such value, then return n. */ -int DI arg_first_ge(const int* array, int n, int val) -{ +int DI arg_first_ge(const int *array, int n, int val) { int start = 0; - int end = n - 1; + int end = n - 1; if (array[0] == val) return 0; if (array[end] < val) return n; while (start + 1 < end) { int q = (start + end + 1) / 2; - // invariants: + //invariants: // start < end // start < q <=end // array[start] < val && array[end] <=val @@ -161,8 +157,7 @@ int DI arg_first_ge(const int* array, int n, int val) * @return the idx of the k-th occurance of val in array, or -1 if * the value is not found. */ -int DI find_nth_occurrence(const int* array, int n, int val, int k) -{ +int DI find_nth_occurrence(const int *array, int n, int val, int k) { int q = arg_first_ge(array, n, val); if (q + k < n && array[q + k] == val) { q += k; @@ -201,10 +196,10 @@ int DI find_nth_occurrence(const int* array, int n, int val, int k) * Each block should give a different pointer for rank. */ template -DI void rank_set_entries(const int* cache_time, int n_cache_sets, int* rank) -{ +DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) { const int items_per_thread = raft::ceildiv(associativity, nthreads); - typedef cub::BlockRadixSort BlockRadixSort; + typedef cub::BlockRadixSort + BlockRadixSort; __shared__ typename BlockRadixSort::TempStorage temp_storage; int key[items_per_thread]; @@ -213,8 +208,8 @@ DI void rank_set_entries(const int* cache_time, int n_cache_sets, int* rank) int block_offset = blockIdx.x * associativity; for (int j = 0; j < items_per_thread; j++) { - int k = threadIdx.x + j * nthreads; - int t = (k < associativity) ? cache_time[block_offset + k] : 32768; + int k = threadIdx.x + j * nthreads; + int t = (k < associativity) ? cache_time[block_offset + k] : 32768; key[j] = t; val[j] = k; } @@ -222,7 +217,9 @@ DI void rank_set_entries(const int* cache_time, int n_cache_sets, int* rank) BlockRadixSort(temp_storage).Sort(key, val); for (int j = 0; j < items_per_thread; j++) { - if (val[j] < associativity) { rank[val[j]] = threadIdx.x * items_per_thread + j; } + if (val[j] < associativity) { + rank[val[j]] = threadIdx.x * items_per_thread + j; + } } __syncthreads(); } @@ -255,15 +252,9 @@ DI void rank_set_entries(const int* cache_time, int n_cache_sets, int* rank) * not be cached, size [n] */ template -__global__ void assign_cache_idx(const int* keys, - int n, - const int* cache_set, - int* cached_keys, - int n_cache_sets, - int* cache_time, - int time, - int* cache_idx) -{ +__global__ void assign_cache_idx(const int *keys, int n, const int *cache_set, + int *cached_keys, int n_cache_sets, + int *cache_time, int time, int *cache_idx) { int block_offset = blockIdx.x * associativity; const int items_per_thread = raft::ceildiv(associativity, nthreads); @@ -282,7 +273,7 @@ __global__ void assign_cache_idx(const int* keys, // these elements are assigned -1. for (int j = 0; j < items_per_thread; j++) { - int i = threadIdx.x + j * nthreads; + int i = threadIdx.x + j * nthreads; int t_idx = block_offset + i; bool mask = (i < associativity); // whether this slot is available for writing @@ -293,10 +284,10 @@ __global__ void assign_cache_idx(const int* keys, if (mask) { int k = find_nth_occurrence(cache_set, n, blockIdx.x, rank[i]); if (k > -1) { - int key_val = keys[k]; + int key_val = keys[k]; cached_keys[t_idx] = key_val; - cache_idx[k] = t_idx; - cache_time[t_idx] = time; + cache_idx[k] = t_idx; + cache_time[t_idx] = time; } } } @@ -324,28 +315,21 @@ namespace { * @param [inout] cached_keys keys stored in the cache, size [n_cache_sets * associativity] * @param [in] n_cache_sets number of cache sets * @param [in] associativity number of keys in cache set - * @param [inout] cache_time time stamp when the indices were cached, size [n_cache_sets * - * associativity] + * @param [inout] cache_time time stamp when the indices were cached, size [n_cache_sets * associativity] * @param [out] cache_idx cache indices of the working set elements, size [n] * @param [out] is_cached whether the element is cached size[n] * @param [in] time iteration counter (used for time stamping) */ -__global__ void get_cache_idx(int* keys, - int n, - int* cached_keys, - int n_cache_sets, - int associativity, - int* cache_time, - int* cache_idx, - bool* is_cached, - int time) -{ +__global__ void get_cache_idx(int *keys, int n, int *cached_keys, + int n_cache_sets, int associativity, + int *cache_time, int *cache_idx, bool *is_cached, + int time) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < n) { - int widx = keys[tid]; - int sidx = hash(widx, n_cache_sets); - int cidx = sidx * associativity; - int i = 0; + int widx = keys[tid]; + int sidx = hash(widx, n_cache_sets); + int cidx = sidx * associativity; + int i = 0; bool found = false; // search for empty spot and the least recently used spot while (i < associativity && !found) { @@ -354,9 +338,9 @@ __global__ void get_cache_idx(int* keys, } is_cached[tid] = found; if (found) { - cidx = cidx + i - 1; - cache_time[cidx] = time; // update time stamp - cache_idx[tid] = cidx; // exact cache idx + cidx = cidx + i - 1; + cache_time[cidx] = time; //update time stamp + cache_idx[tid] = cidx; //exact cache idx } else { cache_idx[tid] = sidx; // assign cache set } diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh index 4767c7f254..8d5b29f700 100644 --- a/cpp/include/raft/common/cub_wrappers.cuh +++ b/cpp/include/raft/common/cub_wrappers.cuh @@ -22,32 +22,28 @@ namespace raft { /** - * @brief Convenience wrapper over cub's SortPairs method - * @tparam KeyT key type - * @tparam ValueT value type - * @param workspace workspace buffer which will get resized if not enough space - * @param inKeys input keys array - * @param outKeys output keys array - * @param inVals input values array - * @param outVals output values array - * @param len array length - * @param stream cuda stream - */ + * @brief Convenience wrapper over cub's SortPairs method + * @tparam KeyT key type + * @tparam ValueT value type + * @param workspace workspace buffer which will get resized if not enough space + * @param inKeys input keys array + * @param outKeys output keys array + * @param inVals input values array + * @param outVals output values array + * @param len array length + * @param stream cuda stream + */ template -void sortPairs(raft::mr::device::buffer& workspace, - const KeyT* inKeys, - KeyT* outKeys, - const ValueT* inVals, - ValueT* outVals, - int len, - cudaStream_t stream) -{ +void sortPairs(raft::mr::device::buffer &workspace, const KeyT *inKeys, + KeyT *outKeys, const ValueT *inVals, ValueT *outVals, int len, + cudaStream_t stream) { size_t worksize; - cub::DeviceRadixSort::SortPairs( - nullptr, worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream); + cub::DeviceRadixSort::SortPairs(nullptr, worksize, inKeys, outKeys, inVals, + outVals, len, 0, sizeof(KeyT) * 8, stream); workspace.resize(worksize, stream); - cub::DeviceRadixSort::SortPairs( - workspace.data(), worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream); + cub::DeviceRadixSort::SortPairs(workspace.data(), worksize, inKeys, outKeys, + inVals, outVals, len, 0, sizeof(KeyT) * 8, + stream); } } // namespace raft diff --git a/cpp/include/raft/common/device_loads_stores.cuh b/cpp/include/raft/common/device_loads_stores.cuh index 41dc9cab08..bb2b019ecb 100644 --- a/cpp/include/raft/common/device_loads_stores.cuh +++ b/cpp/include/raft/common/device_loads_stores.cuh @@ -31,43 +31,40 @@ namespace raft { * @param[out] addr shared memory address (should be aligned to vector size) * @param[in] x data to be stored at this address */ -DI void sts(float* addr, const float& x) -{ +DI void sts(float* addr, const float& x) { auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x)); } -DI void sts(float* addr, const float (&x)[1]) -{ +DI void sts(float* addr, const float (&x)[1]) { auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x[0])); } -DI void sts(float* addr, const float (&x)[2]) -{ +DI void sts(float* addr, const float (&x)[2]) { auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v2.f32 [%0], {%1, %2};" : : "l"(s2), "f"(x[0]), "f"(x[1])); + asm volatile("st.shared.v2.f32 [%0], {%1, %2};" + : + : "l"(s2), "f"(x[0]), "f"(x[1])); } -DI void sts(float* addr, const float (&x)[4]) -{ +DI void sts(float* addr, const float (&x)[4]) { auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.v4.f32 [%0], {%1, %2, %3, %4};" : : "l"(s4), "f"(x[0]), "f"(x[1]), "f"(x[2]), "f"(x[3])); } -DI void sts(double* addr, const double& x) -{ +DI void sts(double* addr, const double& x) { auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x)); } -DI void sts(double* addr, const double (&x)[1]) -{ +DI void sts(double* addr, const double (&x)[1]) { auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x[0])); } -DI void sts(double* addr, const double (&x)[2]) -{ +DI void sts(double* addr, const double (&x)[2]) { auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v2.f64 [%0], {%1, %2};" : : "l"(s2), "d"(x[0]), "d"(x[1])); + asm volatile("st.shared.v2.f64 [%0], {%1, %2};" + : + : "l"(s2), "d"(x[0]), "d"(x[1])); } /** @} */ @@ -83,42 +80,39 @@ DI void sts(double* addr, const double (&x)[2]) * @param[in] addr shared memory address from where to load * (should be aligned to vector size) */ -DI void lds(float& x, float* addr) -{ +DI void lds(float& x, float* addr) { auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "l"(s1)); } -DI void lds(float (&x)[1], float* addr) -{ +DI void lds(float (&x)[1], float* addr) { auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x[0]) : "l"(s1)); } -DI void lds(float (&x)[2], float* addr) -{ +DI void lds(float (&x)[2], float* addr) { auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(s2)); + asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" + : "=f"(x[0]), "=f"(x[1]) + : "l"(s2)); } -DI void lds(float (&x)[4], float* addr) -{ +DI void lds(float (&x)[4], float* addr) { auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3]) : "l"(s4)); } -DI void lds(double& x, double* addr) -{ +DI void lds(double& x, double* addr) { auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x) : "l"(s1)); } -DI void lds(double (&x)[1], double* addr) -{ +DI void lds(double (&x)[1], double* addr) { auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x[0]) : "l"(s1)); } -DI void lds(double (&x)[2], double* addr) -{ +DI void lds(double (&x)[2], double* addr) { auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(s2)); + asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];" + : "=d"(x[0]), "=d"(x[1]) + : "l"(s2)); } /** @} */ @@ -129,35 +123,32 @@ DI void lds(double (&x)[2], double* addr) * @param[out] x data to be loaded from global memory * @param[in] addr address in global memory from where to load */ -DI void ldg(float& x, const float* addr) -{ +DI void ldg(float& x, const float* addr) { asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x) : "l"(addr)); } -DI void ldg(float (&x)[1], const float* addr) -{ +DI void ldg(float (&x)[1], const float* addr) { asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x[0]) : "l"(addr)); } -DI void ldg(float (&x)[2], const float* addr) -{ - asm volatile("ld.global.cg.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(addr)); +DI void ldg(float (&x)[2], const float* addr) { + asm volatile("ld.global.cg.v2.f32 {%0, %1}, [%2];" + : "=f"(x[0]), "=f"(x[1]) + : "l"(addr)); } -DI void ldg(float (&x)[4], const float* addr) -{ +DI void ldg(float (&x)[4], const float* addr) { asm volatile("ld.global.cg.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3]) : "l"(addr)); } -DI void ldg(double& x, const double* addr) -{ +DI void ldg(double& x, const double* addr) { asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x) : "l"(addr)); } -DI void ldg(double (&x)[1], const double* addr) -{ +DI void ldg(double (&x)[1], const double* addr) { asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x[0]) : "l"(addr)); } -DI void ldg(double (&x)[2], const double* addr) -{ - asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(addr)); +DI void ldg(double (&x)[2], const double* addr) { + asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];" + : "=d"(x[0]), "=d"(x[1]) + : "l"(addr)); } /** @} */ diff --git a/cpp/include/raft/common/scatter.cuh b/cpp/include/raft/common/scatter.cuh index b228ac5499..785794461e 100644 --- a/cpp/include/raft/common/scatter.cuh +++ b/cpp/include/raft/common/scatter.cuh @@ -22,8 +22,8 @@ namespace raft { template -__global__ void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op) -{ +__global__ void scatterKernel(DataT *out, const DataT *in, const IdxT *idx, + IdxT len, Lambda op) { typedef TxN_t DataVec; typedef TxN_t IdxVec; IdxT tid = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x); @@ -34,60 +34,61 @@ __global__ void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT DataVec dataIn; #pragma unroll for (int i = 0; i < VecLen; ++i) { - auto inPos = idxIn.val.data[i]; + auto inPos = idxIn.val.data[i]; dataIn.val.data[i] = op(in[inPos], tid + i); } dataIn.store(out, tid); } template -void scatterImpl( - DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op, cudaStream_t stream) -{ +void scatterImpl(DataT *out, const DataT *in, const IdxT *idx, IdxT len, + Lambda op, cudaStream_t stream) { const IdxT nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxT)TPB); - scatterKernel<<>>(out, in, idx, len, op); + scatterKernel + <<>>(out, in, idx, len, op); CUDA_CHECK(cudaGetLastError()); } /** - * @brief Performs scatter operation based on the input indexing array - * @tparam DataT data type whose array gets scattered - * @tparam IdxT indexing type - * @tparam TPB threads-per-block in the final kernel launched - * @tparam Lambda the device-lambda performing a unary operation on the loaded - * data before it gets scattered - * @param out the output array - * @param in the input array - * @param idx the indexing array - * @param len number of elements in the input array - * @param stream cuda stream where to launch work - * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This - * will be applied to every element before scattering it to the right location. - * The second param in this method will be the destination index. - */ -template , int TPB = 256> -void scatter(DataT* out, - const DataT* in, - const IdxT* idx, - IdxT len, - cudaStream_t stream, - Lambda op = raft::Nop()) -{ + * @brief Performs scatter operation based on the input indexing array + * @tparam DataT data type whose array gets scattered + * @tparam IdxT indexing type + * @tparam TPB threads-per-block in the final kernel launched + * @tparam Lambda the device-lambda performing a unary operation on the loaded + * data before it gets scattered + * @param out the output array + * @param in the input array + * @param idx the indexing array + * @param len number of elements in the input array + * @param stream cuda stream where to launch work + * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This + * will be applied to every element before scattering it to the right location. + * The second param in this method will be the destination index. + */ +template , int TPB = 256> +void scatter(DataT *out, const DataT *in, const IdxT *idx, IdxT len, + cudaStream_t stream, Lambda op = raft::Nop()) { if (len <= 0) return; - constexpr size_t DataSize = sizeof(DataT); - constexpr size_t IdxSize = sizeof(IdxT); + constexpr size_t DataSize = sizeof(DataT); + constexpr size_t IdxSize = sizeof(IdxT); constexpr size_t MaxPerElem = DataSize > IdxSize ? DataSize : IdxSize; - size_t bytes = len * MaxPerElem; + size_t bytes = len * MaxPerElem; if (16 / MaxPerElem && bytes % 16 == 0) { - scatterImpl(out, in, idx, len, op, stream); + scatterImpl(out, in, idx, len, + op, stream); } else if (8 / MaxPerElem && bytes % 8 == 0) { - scatterImpl(out, in, idx, len, op, stream); + scatterImpl(out, in, idx, len, op, + stream); } else if (4 / MaxPerElem && bytes % 4 == 0) { - scatterImpl(out, in, idx, len, op, stream); + scatterImpl(out, in, idx, len, op, + stream); } else if (2 / MaxPerElem && bytes % 2 == 0) { - scatterImpl(out, in, idx, len, op, stream); + scatterImpl(out, in, idx, len, op, + stream); } else if (1 / MaxPerElem) { - scatterImpl(out, in, idx, len, op, stream); + scatterImpl(out, in, idx, len, op, + stream); } else { scatterImpl(out, in, idx, len, op, stream); } diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 72c3b3897e..dc172c9503 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -25,7 +25,16 @@ namespace raft { namespace comms { typedef unsigned int request_t; -enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; +enum class datatype_t { + CHAR, + UINT8, + INT32, + UINT32, + INT64, + UINT64, + FLOAT32, + FLOAT64 +}; enum class op_t { SUM, PROD, MIN, MAX }; /** @@ -41,50 +50,42 @@ template constexpr datatype_t get_type(); template <> -constexpr datatype_t get_type() -{ +constexpr datatype_t get_type() { return datatype_t::CHAR; } template <> -constexpr datatype_t get_type() -{ +constexpr datatype_t get_type() { return datatype_t::UINT8; } template <> -constexpr datatype_t get_type() -{ +constexpr datatype_t get_type() { return datatype_t::INT32; } template <> -constexpr datatype_t get_type() -{ +constexpr datatype_t get_type() { return datatype_t::UINT32; } template <> -constexpr datatype_t get_type() -{ +constexpr datatype_t get_type() { return datatype_t::INT64; } template <> -constexpr datatype_t get_type() -{ +constexpr datatype_t get_type() { return datatype_t::UINT64; } template <> -constexpr datatype_t get_type() -{ +constexpr datatype_t get_type() { return datatype_t::FLOAT32; } template <> -constexpr datatype_t get_type() -{ +constexpr datatype_t get_type() { return datatype_t::FLOAT64; } @@ -94,99 +95,72 @@ class comms_iface { virtual int get_rank() const = 0; virtual std::unique_ptr comm_split(int color, int key) const = 0; - virtual void barrier() const = 0; + virtual void barrier() const = 0; virtual status_t sync_stream(cudaStream_t stream) const = 0; - virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; + virtual void isend(const void* buf, size_t size, int dest, int tag, + request_t* request) const = 0; - virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; + virtual void irecv(void* buf, size_t size, int source, int tag, + request_t* request) const = 0; virtual void waitall(int count, request_t array_of_requests[]) const = 0; - virtual void allreduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - op_t op, + virtual void allreduce(const void* sendbuff, void* recvbuff, size_t count, + datatype_t datatype, op_t op, cudaStream_t stream) const = 0; - virtual void bcast( - void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; + virtual void bcast(void* buff, size_t count, datatype_t datatype, int root, + cudaStream_t stream) const = 0; - virtual void reduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - op_t op, - int root, + virtual void reduce(const void* sendbuff, void* recvbuff, size_t count, + datatype_t datatype, op_t op, int root, cudaStream_t stream) const = 0; - virtual void allgather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - cudaStream_t stream) const = 0; + virtual void allgather(const void* sendbuff, void* recvbuff, size_t sendcount, + datatype_t datatype, cudaStream_t stream) const = 0; + + virtual void allgatherv(const void* sendbuf, void* recvbuf, + const size_t* recvcounts, const size_t* displs, + datatype_t datatype, cudaStream_t stream) const = 0; - virtual void allgatherv(const void* sendbuf, - void* recvbuf, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - cudaStream_t stream) const = 0; - - virtual void gather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - int root, + virtual void gather(const void* sendbuff, void* recvbuff, size_t sendcount, + datatype_t datatype, int root, cudaStream_t stream) const = 0; - virtual void gatherv(const void* sendbuf, - void* recvbuf, - size_t sendcount, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - int root, + virtual void gatherv(const void* sendbuf, void* recvbuf, size_t sendcount, + const size_t* recvcounts, const size_t* displs, + datatype_t datatype, int root, cudaStream_t stream) const = 0; - virtual void reducescatter(const void* sendbuff, - void* recvbuff, - size_t recvcount, - datatype_t datatype, - op_t op, + virtual void reducescatter(const void* sendbuff, void* recvbuff, + size_t recvcount, datatype_t datatype, op_t op, cudaStream_t stream) const = 0; // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; + virtual void device_send(const void* buf, size_t size, int dest, + cudaStream_t stream) const = 0; // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; - - virtual void device_sendrecv(const void* sendbuf, - size_t sendsize, - int dest, - void* recvbuf, - size_t recvsize, - int source, + virtual void device_recv(void* buf, size_t size, int source, + cudaStream_t stream) const = 0; + + virtual void device_sendrecv(const void* sendbuf, size_t sendsize, int dest, + void* recvbuf, size_t recvsize, int source, cudaStream_t stream) const = 0; - virtual void device_multicast_sendrecv(const void* sendbuf, - std::vector const& sendsizes, - std::vector const& sendoffsets, - std::vector const& dests, - void* recvbuf, - std::vector const& recvsizes, - std::vector const& recvoffsets, - std::vector const& sources, - cudaStream_t stream) const = 0; + virtual void device_multicast_sendrecv( + const void* sendbuf, std::vector const& sendsizes, + std::vector const& sendoffsets, std::vector const& dests, + void* recvbuf, std::vector const& recvsizes, + std::vector const& recvoffsets, std::vector const& sources, + cudaStream_t stream) const = 0; }; class comms_t { public: - comms_t(std::unique_ptr impl) : impl_(impl.release()) - { + comms_t(std::unique_ptr impl) : impl_(impl.release()) { ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); } @@ -213,8 +187,7 @@ class comms_t { * @param color ranks w/ the same color are placed in the same communicator * @param key controls rank assignment */ - std::unique_ptr comm_split(int color, int key) const - { + std::unique_ptr comm_split(int color, int key) const { return impl_->comm_split(color, key); } @@ -231,7 +204,9 @@ class comms_t { * * @param stream the cuda stream to sync collective operations on */ - status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } + status_t sync_stream(cudaStream_t stream) const { + return impl_->sync_stream(stream); + } /** * Performs an asynchronous point-to-point send @@ -244,9 +219,10 @@ class comms_t { * This will be used in `waitall()` to synchronize until the message is delivered (or fails). */ template - void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const - { - impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); + void isend(const value_t* buf, size_t size, int dest, int tag, + request_t* request) const { + impl_->isend(static_cast(buf), size * sizeof(value_t), dest, + tag, request); } /** @@ -260,9 +236,10 @@ class comms_t { * This will be used in `waitall()` to synchronize until the message is delivered (or fails). */ template - void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const - { - impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); + void irecv(value_t* buf, size_t size, int source, int tag, + request_t* request) const { + impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, + request); } /** @@ -270,8 +247,7 @@ class comms_t { * @param count number of requests to synchronize on * @param array_of_requests an array of request_t objects returned from isend/irecv */ - void waitall(int count, request_t array_of_requests[]) const - { + void waitall(int count, request_t array_of_requests[]) const { impl_->waitall(count, array_of_requests); } @@ -285,15 +261,11 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void allreduce( - const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const - { + void allreduce(const value_t* sendbuff, value_t* recvbuff, size_t count, + op_t op, cudaStream_t stream) const { impl_->allreduce(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - op, - stream); + static_cast(recvbuff), count, get_type(), + op, stream); } /** @@ -305,9 +277,9 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const - { - impl_->bcast(static_cast(buff), count, get_type(), root, stream); + void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const { + impl_->bcast(static_cast(buff), count, get_type(), root, + stream); } /** @@ -321,20 +293,11 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void reduce(const value_t* sendbuff, - value_t* recvbuff, - size_t count, - op_t op, - int root, - cudaStream_t stream) const - { + void reduce(const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, + int root, cudaStream_t stream) const { impl_->reduce(static_cast(sendbuff), - static_cast(recvbuff), - count, - get_type(), - op, - root, - stream); + static_cast(recvbuff), count, get_type(), op, + root, stream); } /** @@ -346,16 +309,11 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void allgather(const value_t* sendbuff, - value_t* recvbuff, - size_t sendcount, - cudaStream_t stream) const - { + void allgather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount, + cudaStream_t stream) const { impl_->allgather(static_cast(sendbuff), - static_cast(recvbuff), - sendcount, - get_type(), - stream); + static_cast(recvbuff), sendcount, + get_type(), stream); } /** @@ -370,18 +328,12 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void allgatherv(const value_t* sendbuf, - value_t* recvbuf, - const size_t* recvcounts, - const size_t* displs, - cudaStream_t stream) const - { + void allgatherv(const value_t* sendbuf, value_t* recvbuf, + const size_t* recvcounts, const size_t* displs, + cudaStream_t stream) const { impl_->allgatherv(static_cast(sendbuf), - static_cast(recvbuf), - recvcounts, - displs, - get_type(), - stream); + static_cast(recvbuf), recvcounts, displs, + get_type(), stream); } /** @@ -394,18 +346,11 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void gather(const value_t* sendbuff, - value_t* recvbuff, - size_t sendcount, - int root, - cudaStream_t stream) const - { + void gather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount, + int root, cudaStream_t stream) const { impl_->gather(static_cast(sendbuff), - static_cast(recvbuff), - sendcount, - get_type(), - root, - stream); + static_cast(recvbuff), sendcount, get_type(), + root, stream); } /** @@ -422,22 +367,12 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void gatherv(const value_t* sendbuf, - value_t* recvbuf, - size_t sendcount, - const size_t* recvcounts, - const size_t* displs, - int root, - cudaStream_t stream) const - { + void gatherv(const value_t* sendbuf, value_t* recvbuf, size_t sendcount, + const size_t* recvcounts, const size_t* displs, int root, + cudaStream_t stream) const { impl_->gatherv(static_cast(sendbuf), - static_cast(recvbuf), - sendcount, - recvcounts, - displs, - get_type(), - root, - stream); + static_cast(recvbuf), sendcount, recvcounts, displs, + get_type(), root, stream); } /** @@ -449,18 +384,11 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void reducescatter(const value_t* sendbuff, - value_t* recvbuff, - size_t recvcount, - op_t op, - cudaStream_t stream) const - { + void reducescatter(const value_t* sendbuff, value_t* recvbuff, + size_t recvcount, op_t op, cudaStream_t stream) const { impl_->reducescatter(static_cast(sendbuff), - static_cast(recvbuff), - recvcount, - get_type(), - op, - stream); + static_cast(recvbuff), recvcount, + get_type(), op, stream); } /** @@ -475,9 +403,10 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const - { - impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); + void device_send(const value_t* buf, size_t size, int dest, + cudaStream_t stream) const { + impl_->device_send(static_cast(buf), size * sizeof(value_t), + dest, stream); } /** @@ -492,9 +421,10 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const - { - impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); + void device_recv(value_t* buf, size_t size, int source, + cudaStream_t stream) const { + impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, + stream); } /** @@ -510,21 +440,12 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_sendrecv(const value_t* sendbuf, - size_t sendsize, - int dest, - value_t* recvbuf, - size_t recvsize, - int source, - cudaStream_t stream) const - { - impl_->device_sendrecv(static_cast(sendbuf), - sendsize * sizeof(value_t), - dest, - static_cast(recvbuf), - recvsize * sizeof(value_t), - source, - stream); + void device_sendrecv(const value_t* sendbuf, size_t sendsize, int dest, + value_t* recvbuf, size_t recvsize, int source, + cudaStream_t stream) const { + impl_->device_sendrecv( + static_cast(sendbuf), sendsize * sizeof(value_t), dest, + static_cast(recvbuf), recvsize * sizeof(value_t), source, stream); } /** @@ -542,37 +463,28 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_multicast_sendrecv(const value_t* sendbuf, - std::vector const& sendsizes, - std::vector const& sendoffsets, - std::vector const& dests, - value_t* recvbuf, - std::vector const& recvsizes, - std::vector const& recvoffsets, - std::vector const& sources, - cudaStream_t stream) const - { - auto sendbytesizes = sendsizes; + void device_multicast_sendrecv( + const value_t* sendbuf, std::vector const& sendsizes, + std::vector const& sendoffsets, std::vector const& dests, + value_t* recvbuf, std::vector const& recvsizes, + std::vector const& recvoffsets, std::vector const& sources, + cudaStream_t stream) const { + auto sendbytesizes = sendsizes; auto sendbyteoffsets = sendoffsets; for (size_t i = 0; i < sendsizes.size(); ++i) { sendbytesizes[i] *= sizeof(value_t); sendbyteoffsets[i] *= sizeof(value_t); } - auto recvbytesizes = recvsizes; + auto recvbytesizes = recvsizes; auto recvbyteoffsets = recvoffsets; for (size_t i = 0; i < recvsizes.size(); ++i) { recvbytesizes[i] *= sizeof(value_t); recvbyteoffsets[i] *= sizeof(value_t); } impl_->device_multicast_sendrecv(static_cast(sendbuf), - sendbytesizes, - sendbyteoffsets, - dests, - static_cast(recvbuf), - recvbytesizes, - recvbyteoffsets, - sources, - stream); + sendbytesizes, sendbyteoffsets, dests, + static_cast(recvbuf), recvbytesizes, + recvbyteoffsets, sources, stream); } private: diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp index 93e31b4d6a..7b24e31bbe 100644 --- a/cpp/include/raft/comms/helper.hpp +++ b/cpp/include/raft/comms/helper.hpp @@ -36,9 +36,9 @@ namespace comms { * @param num_ranks number of ranks in communicator clique * @param rank rank of local instance */ -void build_comms_nccl_only(handle_t* handle, ncclComm_t nccl_comm, int num_ranks, int rank) -{ - auto d_alloc = handle->get_device_allocator(); +void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, + int num_ranks, int rank) { + auto d_alloc = handle->get_device_allocator(); cudaStream_t stream = handle->get_stream(); auto communicator = std::make_shared(std::unique_ptr( @@ -61,41 +61,40 @@ void build_comms_nccl_only(handle_t* handle, ncclComm_t nccl_comm, int num_ranks * @param num_ranks number of ranks in communicator clique * @param rank rank of local instance */ -void build_comms_nccl_ucx( - handle_t* handle, ncclComm_t nccl_comm, void* ucp_worker, void* eps, int num_ranks, int rank) -{ - auto eps_sp = std::make_shared(new ucp_ep_h[num_ranks]); +void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, + void *ucp_worker, void *eps, int num_ranks, + int rank) { + auto eps_sp = std::make_shared(new ucp_ep_h[num_ranks]); - auto size_t_ep_arr = reinterpret_cast(eps); + auto size_t_ep_arr = reinterpret_cast(eps); for (int i = 0; i < num_ranks; i++) { - size_t ptr = size_t_ep_arr[i]; - auto ucp_ep_v = reinterpret_cast(*eps_sp); + size_t ptr = size_t_ep_arr[i]; + auto ucp_ep_v = reinterpret_cast(*eps_sp); if (ptr != 0) { auto eps_ptr = reinterpret_cast(size_t_ep_arr[i]); - ucp_ep_v[i] = eps_ptr; + ucp_ep_v[i] = eps_ptr; } else { ucp_ep_v[i] = nullptr; } } - auto d_alloc = handle->get_device_allocator(); + auto d_alloc = handle->get_device_allocator(); cudaStream_t stream = handle->get_stream(); - auto communicator = - std::make_shared(std::unique_ptr(new raft::comms::std_comms( - nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, d_alloc, stream))); + auto communicator = std::make_shared(std::unique_ptr( + new raft::comms::std_comms(nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, + num_ranks, rank, d_alloc, stream))); handle->set_comms(communicator); } -inline void nccl_unique_id_from_char(ncclUniqueId* id, char* uniqueId, int size) -{ +inline void nccl_unique_id_from_char(ncclUniqueId *id, char *uniqueId, + int size) { memcpy(id->internal, uniqueId, size); } -inline void get_unique_id(char* uid, int size) -{ +inline void get_unique_id(char *uid, int size) { ncclUniqueId id; ncclGetUniqueId(&id); diff --git a/cpp/include/raft/comms/mpi_comms.hpp b/cpp/include/raft/comms/mpi_comms.hpp index 65f38b2625..8dda74f0a9 100644 --- a/cpp/include/raft/comms/mpi_comms.hpp +++ b/cpp/include/raft/comms/mpi_comms.hpp @@ -32,16 +32,16 @@ #include #include -#define MPI_TRY(call) \ - do { \ - int status = call; \ - if (MPI_SUCCESS != status) { \ - int mpi_error_string_lenght = 0; \ - char mpi_error_string[MPI_MAX_ERROR_STRING]; \ - MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \ - RAFT_EXPECTS( \ - MPI_SUCCESS == status, "ERROR: MPI call='%s'. Reason:%s\n", #call, mpi_error_string); \ - } \ +#define MPI_TRY(call) \ + do { \ + int status = call; \ + if (MPI_SUCCESS != status) { \ + int mpi_error_string_lenght = 0; \ + char mpi_error_string[MPI_MAX_ERROR_STRING]; \ + MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \ + RAFT_EXPECTS(MPI_SUCCESS == status, "ERROR: MPI call='%s'. Reason:%s\n", \ + #call, mpi_error_string); \ + } \ } while (0) #define MPI_TRY_NO_THROW(call) \ @@ -51,41 +51,48 @@ int mpi_error_string_lenght = 0; \ char mpi_error_string[MPI_MAX_ERROR_STRING]; \ MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \ - printf("MPI call='%s' at file=%s line=%d failed with %s ", \ - #call, \ - __FILE__, \ - __LINE__, \ - mpi_error_string); \ + printf("MPI call='%s' at file=%s line=%d failed with %s ", #call, \ + __FILE__, __LINE__, mpi_error_string); \ } \ } while (0) namespace raft { namespace comms { -constexpr MPI_Datatype get_mpi_datatype(const datatype_t datatype) -{ +constexpr MPI_Datatype get_mpi_datatype(const datatype_t datatype) { switch (datatype) { - case datatype_t::CHAR: return MPI_CHAR; - case datatype_t::UINT8: return MPI_UNSIGNED_CHAR; - case datatype_t::INT32: return MPI_INT; - case datatype_t::UINT32: return MPI_UNSIGNED; - case datatype_t::INT64: return MPI_LONG_LONG; - case datatype_t::UINT64: return MPI_UNSIGNED_LONG_LONG; - case datatype_t::FLOAT32: return MPI_FLOAT; - case datatype_t::FLOAT64: return MPI_DOUBLE; + case datatype_t::CHAR: + return MPI_CHAR; + case datatype_t::UINT8: + return MPI_UNSIGNED_CHAR; + case datatype_t::INT32: + return MPI_INT; + case datatype_t::UINT32: + return MPI_UNSIGNED; + case datatype_t::INT64: + return MPI_LONG_LONG; + case datatype_t::UINT64: + return MPI_UNSIGNED_LONG_LONG; + case datatype_t::FLOAT32: + return MPI_FLOAT; + case datatype_t::FLOAT64: + return MPI_DOUBLE; default: // Execution should never reach here. This takes care of compiler warning. return MPI_DOUBLE; } } -constexpr MPI_Op get_mpi_op(const op_t op) -{ +constexpr MPI_Op get_mpi_op(const op_t op) { switch (op) { - case op_t::SUM: return MPI_SUM; - case op_t::PROD: return MPI_PROD; - case op_t::MIN: return MPI_MIN; - case op_t::MAX: return MPI_MAX; + case op_t::SUM: + return MPI_SUM; + case op_t::PROD: + return MPI_PROD; + case op_t::MIN: + return MPI_MIN; + case op_t::MAX: + return MPI_MAX; default: // Execution should never reach here. This takes care of compiler warning. return MPI_MAX; @@ -95,35 +102,38 @@ constexpr MPI_Op get_mpi_op(const op_t op) class mpi_comms : public comms_iface { public: mpi_comms(MPI_Comm comm, const bool owns_mpi_comm) - : owns_mpi_comm_(owns_mpi_comm), mpi_comm_(comm), size_(0), rank_(1), next_request_id_(0) - { + : owns_mpi_comm_(owns_mpi_comm), + mpi_comm_(comm), + size_(0), + rank_(1), + next_request_id_(0) { int mpi_is_initialized = 0; MPI_TRY(MPI_Initialized(&mpi_is_initialized)); RAFT_EXPECTS(mpi_is_initialized, "ERROR: MPI is not initialized!"); MPI_TRY(MPI_Comm_size(mpi_comm_, &size_)); MPI_TRY(MPI_Comm_rank(mpi_comm_, &rank_)); - // get NCCL unique ID at rank 0 and broadcast it to all others + //get NCCL unique ID at rank 0 and broadcast it to all others ncclUniqueId id; if (0 == rank_) NCCL_TRY(ncclGetUniqueId(&id)); MPI_TRY(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, mpi_comm_)); - // initializing NCCL + //initializing NCCL NCCL_TRY(ncclCommInitRank(&nccl_comm_, size_, id, rank_)); } - virtual ~mpi_comms() - { - // finalizing NCCL + virtual ~mpi_comms() { + //finalizing NCCL NCCL_TRY_NO_THROW(ncclCommDestroy(nccl_comm_)); - if (owns_mpi_comm_) { MPI_TRY_NO_THROW(MPI_Comm_free(&mpi_comm_)); } + if (owns_mpi_comm_) { + MPI_TRY_NO_THROW(MPI_Comm_free(&mpi_comm_)); + } } int get_size() const { return size_; } int get_rank() const { return rank_; } - std::unique_ptr comm_split(int color, int key) const - { + std::unique_ptr comm_split(int color, int key) const { MPI_Comm new_comm; MPI_TRY(MPI_Comm_split(mpi_comm_, color, key, &new_comm)); return std::unique_ptr(new mpi_comms(new_comm, true)); @@ -131,15 +141,15 @@ class mpi_comms : public comms_iface { void barrier() const { MPI_TRY(MPI_Barrier(mpi_comm_)); } - void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const - { + void isend(const void* buf, size_t size, int dest, int tag, + request_t* request) const { MPI_Request mpi_req; request_t req_id; if (free_requests_.empty()) { req_id = next_request_id_++; } else { auto it = free_requests_.begin(); - req_id = *it; + req_id = *it; free_requests_.erase(it); } MPI_TRY(MPI_Isend(buf, size, MPI_BYTE, dest, tag, mpi_comm_, &mpi_req)); @@ -147,15 +157,15 @@ class mpi_comms : public comms_iface { *request = req_id; } - void irecv(void* buf, size_t size, int source, int tag, request_t* request) const - { + void irecv(void* buf, size_t size, int source, int tag, + request_t* request) const { MPI_Request mpi_req; request_t req_id; if (free_requests_.empty()) { req_id = next_request_id_++; } else { auto it = free_requests_.begin(); - req_id = *it; + req_id = *it; free_requests_.erase(it); } @@ -164,8 +174,7 @@ class mpi_comms : public comms_iface { *request = req_id; } - void waitall(int count, request_t array_of_requests[]) const - { + void waitall(int count, request_t array_of_requests[]) const { std::vector requests; requests.reserve(count); for (int i = 0; i < count; ++i) { @@ -180,138 +189,87 @@ class mpi_comms : public comms_iface { MPI_TRY(MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE)); } - void allreduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - op_t op, - cudaStream_t stream) const - { - NCCL_TRY(ncclAllReduce( - sendbuff, recvbuff, count, get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream)); + void allreduce(const void* sendbuff, void* recvbuff, size_t count, + datatype_t datatype, op_t op, cudaStream_t stream) const { + NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count, + get_nccl_datatype(datatype), get_nccl_op(op), + nccl_comm_, stream)); } - void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const - { - NCCL_TRY( - ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); + void bcast(void* buff, size_t count, datatype_t datatype, int root, + cudaStream_t stream) const { + NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, + nccl_comm_, stream)); } - void reduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - op_t op, - int root, - cudaStream_t stream) const - { - NCCL_TRY(ncclReduce(sendbuff, - recvbuff, - count, - get_nccl_datatype(datatype), - get_nccl_op(op), - root, - nccl_comm_, - stream)); + void reduce(const void* sendbuff, void* recvbuff, size_t count, + datatype_t datatype, op_t op, int root, + cudaStream_t stream) const { + NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype), + get_nccl_op(op), root, nccl_comm_, stream)); } - void allgather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - cudaStream_t stream) const - { - NCCL_TRY(ncclAllGather( - sendbuff, recvbuff, sendcount, get_nccl_datatype(datatype), nccl_comm_, stream)); + void allgather(const void* sendbuff, void* recvbuff, size_t sendcount, + datatype_t datatype, cudaStream_t stream) const { + NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount, + get_nccl_datatype(datatype), nccl_comm_, stream)); } - void allgatherv(const void* sendbuf, - void* recvbuf, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - cudaStream_t stream) const - { - // From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - - // https://arxiv.org/pdf/1812.05964.pdf Listing 1 on page 4. + void allgatherv(const void* sendbuf, void* recvbuf, const size_t* recvcounts, + const size_t* displs, datatype_t datatype, + cudaStream_t stream) const { + //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf + //Listing 1 on page 4. for (int root = 0; root < size_; ++root) { - NCCL_TRY( - ncclBroadcast(sendbuf, - static_cast(recvbuf) + displs[root] * get_datatype_size(datatype), - recvcounts[root], - get_nccl_datatype(datatype), - root, - nccl_comm_, - stream)); + NCCL_TRY(ncclBroadcast(sendbuf, + static_cast(recvbuf) + + displs[root] * get_datatype_size(datatype), + recvcounts[root], get_nccl_datatype(datatype), + root, nccl_comm_, stream)); } } - void gather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - int root, - cudaStream_t stream) const - { + void gather(const void* sendbuff, void* recvbuff, size_t sendcount, + datatype_t datatype, int root, cudaStream_t stream) const { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv(static_cast(recvbuff) + sendcount * r * dtype_size, - sendcount, - get_nccl_datatype(datatype), - r, - nccl_comm_, - stream)); + NCCL_TRY(ncclRecv( + static_cast(recvbuff) + sendcount * r * dtype_size, sendcount, + get_nccl_datatype(datatype), r, nccl_comm_, stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, + nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void gatherv(const void* sendbuff, - void* recvbuff, - size_t sendcount, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - int root, - cudaStream_t stream) const - { + void gatherv(const void* sendbuff, void* recvbuff, size_t sendcount, + const size_t* recvcounts, const size_t* displs, + datatype_t datatype, int root, cudaStream_t stream) const { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { NCCL_TRY(ncclRecv(static_cast(recvbuff) + displs[r] * dtype_size, - recvcounts[r], - get_nccl_datatype(datatype), - r, - nccl_comm_, - stream)); + recvcounts[r], get_nccl_datatype(datatype), r, + nccl_comm_, stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, + nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void reducescatter(const void* sendbuff, - void* recvbuff, - size_t recvcount, - datatype_t datatype, - op_t op, - cudaStream_t stream) const - { - NCCL_TRY(ncclReduceScatter(sendbuff, - recvbuff, - recvcount, - get_nccl_datatype(datatype), - get_nccl_op(op), - nccl_comm_, - stream)); + void reducescatter(const void* sendbuff, void* recvbuff, size_t recvcount, + datatype_t datatype, op_t op, cudaStream_t stream) const { + NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount, + get_nccl_datatype(datatype), get_nccl_op(op), + nccl_comm_, stream)); } - status_t sync_stream(cudaStream_t stream) const - { + status_t sync_stream(cudaStream_t stream) const { cudaError_t cudaErr; ncclResult_t ncclErr, ncclAsyncErr; while (1) { @@ -344,58 +302,45 @@ class mpi_comms : public comms_iface { }; // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const - { + void device_send(const void* buf, size_t size, int dest, + cudaStream_t stream) const { NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream)); } // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const - { + void device_recv(void* buf, size_t size, int source, + cudaStream_t stream) const { NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream)); } - void device_sendrecv(const void* sendbuf, - size_t sendsize, - int dest, - void* recvbuf, - size_t recvsize, - int source, - cudaStream_t stream) const - { + void device_sendrecv(const void* sendbuf, size_t sendsize, int dest, + void* recvbuf, size_t recvsize, int source, + cudaStream_t stream) const { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream)); - NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); + NCCL_TRY( + ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } void device_multicast_sendrecv(const void* sendbuf, std::vector const& sendsizes, std::vector const& sendoffsets, - std::vector const& dests, - void* recvbuf, + std::vector const& dests, void* recvbuf, std::vector const& recvsizes, std::vector const& recvoffsets, std::vector const& sources, - cudaStream_t stream) const - { + cudaStream_t stream) const { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); for (size_t i = 0; i < sendsizes.size(); ++i) { NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], - sendsizes[i], - ncclUint8, - dests[i], - nccl_comm_, - stream)); + sendsizes[i], ncclUint8, dests[i], nccl_comm_, stream)); } for (size_t i = 0; i < recvsizes.size(); ++i) { NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], - recvsizes[i], - ncclUint8, - sources[i], - nccl_comm_, + recvsizes[i], ncclUint8, sources[i], nccl_comm_, stream)); } NCCL_TRY(ncclGroupEnd()); @@ -413,10 +358,9 @@ class mpi_comms : public comms_iface { mutable std::unordered_set free_requests_; }; -inline void initialize_mpi_comms(handle_t* handle, MPI_Comm comm) -{ - auto communicator = - std::make_shared(std::unique_ptr(new mpi_comms(comm, true))); +inline void initialize_mpi_comms(handle_t* handle, MPI_Comm comm) { + auto communicator = std::make_shared( + std::unique_ptr(new mpi_comms(comm, true))); handle->set_comms(communicator); }; diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 5f80328d3f..765e8741bb 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -62,14 +62,10 @@ class std_comms : public comms_iface { * @param size size of the cluster * @param rank rank of the current worker */ - std_comms(ncclComm_t nccl_comm, - ucp_worker_h ucp_worker, - std::shared_ptr eps, - int num_ranks, - int rank, + std_comms(ncclComm_t nccl_comm, ucp_worker_h ucp_worker, + std::shared_ptr eps, int num_ranks, int rank, const std::shared_ptr device_allocator, - cudaStream_t stream, - bool subcomms_ucp = true) + cudaStream_t stream, bool subcomms_ucp = true) : nccl_comm_(nccl_comm), stream_(stream), num_ranks_(num_ranks), @@ -78,8 +74,7 @@ class std_comms : public comms_iface { ucp_worker_(ucp_worker), ucp_eps_(eps), next_request_id_(0), - device_allocator_(device_allocator) - { + device_allocator_(device_allocator) { initialize(); }; @@ -89,9 +84,7 @@ class std_comms : public comms_iface { * @param size size of the cluster * @param rank rank of the current worker */ - std_comms(const ncclComm_t nccl_comm, - int num_ranks, - int rank, + std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank, const std::shared_ptr device_allocator, cudaStream_t stream) : nccl_comm_(nccl_comm), @@ -99,37 +92,37 @@ class std_comms : public comms_iface { num_ranks_(num_ranks), rank_(rank), subcomms_ucp_(false), - device_allocator_(device_allocator) - { + device_allocator_(device_allocator) { initialize(); }; - virtual ~std_comms() - { + virtual ~std_comms() { device_allocator_->deallocate(sendbuff_, sizeof(int), stream_); device_allocator_->deallocate(recvbuff_, sizeof(int), stream_); } - void initialize() - { - sendbuff_ = reinterpret_cast(device_allocator_->allocate(sizeof(int), stream_)); - recvbuff_ = reinterpret_cast(device_allocator_->allocate(sizeof(int), stream_)); + void initialize() { + sendbuff_ = reinterpret_cast( + device_allocator_->allocate(sizeof(int), stream_)); + recvbuff_ = reinterpret_cast( + device_allocator_->allocate(sizeof(int), stream_)); } int get_size() const { return num_ranks_; } int get_rank() const { return rank_; } - std::unique_ptr comm_split(int color, int key) const - { + std::unique_ptr comm_split(int color, int key) const { mr::device::buffer d_colors(device_allocator_, stream_, get_size()); mr::device::buffer d_keys(device_allocator_, stream_, get_size()); update_device(d_colors.data() + get_rank(), &color, 1, stream_); update_device(d_keys.data() + get_rank(), &key, 1, stream_); - allgather(d_colors.data() + get_rank(), d_colors.data(), 1, datatype_t::INT32, stream_); - allgather(d_keys.data() + get_rank(), d_keys.data(), 1, datatype_t::INT32, stream_); + allgather(d_colors.data() + get_rank(), d_colors.data(), 1, + datatype_t::INT32, stream_); + allgather(d_keys.data() + get_rank(), d_keys.data(), 1, datatype_t::INT32, + stream_); this->sync_stream(stream_); std::vector h_colors(get_size()); @@ -146,7 +139,9 @@ class std_comms : public comms_iface { for (int i = 0; i < get_size(); ++i) { if (h_colors[i] == color) { subcomm_ranks.push_back(i); - if (ucp_worker_ != nullptr && subcomms_ucp_) { new_ucx_ptrs.push_back((*ucp_eps_)[i]); } + if (ucp_worker_ != nullptr && subcomms_ucp_) { + new_ucx_ptrs.push_back((*ucp_eps_)[i]); + } } } @@ -155,7 +150,8 @@ class std_comms : public comms_iface { NCCL_TRY(ncclGetUniqueId(&id)); std::vector requests(subcomm_ranks.size() - 1); for (size_t i = 1; i < subcomm_ranks.size(); ++i) { - isend(&id, sizeof(ncclUniqueId), subcomm_ranks[i], color, requests.data() + (i - 1)); + isend(&id, sizeof(ncclUniqueId), subcomm_ranks[i], color, + requests.data() + (i - 1)); } waitall(requests.size(), requests.data()); } else { @@ -170,23 +166,17 @@ class std_comms : public comms_iface { NCCL_TRY(ncclCommInitRank(&nccl_comm, subcomm_ranks.size(), id, key)); if (ucp_worker_ != nullptr && subcomms_ucp_) { - auto eps_sp = std::make_shared(new_ucx_ptrs.data()); - return std::unique_ptr(new std_comms(nccl_comm, - (ucp_worker_h)ucp_worker_, - eps_sp, - subcomm_ranks.size(), - key, - device_allocator_, - stream_, - subcomms_ucp_)); + auto eps_sp = std::make_shared(new_ucx_ptrs.data()); + return std::unique_ptr(new std_comms( + nccl_comm, (ucp_worker_h)ucp_worker_, eps_sp, subcomm_ranks.size(), key, + device_allocator_, stream_, subcomms_ucp_)); } else { - return std::unique_ptr( - new std_comms(nccl_comm, subcomm_ranks.size(), key, device_allocator_, stream_)); + return std::unique_ptr(new std_comms( + nccl_comm, subcomm_ranks.size(), key, device_allocator_, stream_)); } } - void barrier() const - { + void barrier() const { CUDA_CHECK(cudaMemsetAsync(sendbuff_, 1, sizeof(int), stream_)); CUDA_CHECK(cudaMemsetAsync(recvbuff_, 1, sizeof(int), stream_)); @@ -196,37 +186,39 @@ class std_comms : public comms_iface { "ERROR: syncStream failed. This can be caused by a failed rank_."); } - void get_request_id(request_t* req) const - { + void get_request_id(request_t *req) const { request_t req_id; if (this->free_requests_.empty()) req_id = this->next_request_id_++; else { auto it = this->free_requests_.begin(); - req_id = *it; + req_id = *it; this->free_requests_.erase(it); } *req = req_id; } - void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const - { - ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); + void isend(const void *buf, size_t size, int dest, int tag, + request_t *request) const { + ASSERT(ucp_worker_ != nullptr, + "ERROR: UCX comms not initialized on communicator."); get_request_id(request); ucp_ep_h ep_ptr = (*ucp_eps_)[dest]; - ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request)); + ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); - this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, default_tag_mask, get_rank()); + this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, + default_tag_mask, get_rank()); requests_in_flight_.insert(std::make_pair(*request, ucp_req)); } - void irecv(void* buf, size_t size, int source, int tag, request_t* request) const - { - ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); + void irecv(void *buf, size_t size, int source, int tag, + request_t *request) const { + ASSERT(ucp_worker_ != nullptr, + "ERROR: UCX comms not initialized on communicator."); get_request_id(request); @@ -234,17 +226,18 @@ class std_comms : public comms_iface { ucp_tag_t tag_mask = default_tag_mask; - ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request)); - ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag, tag_mask, source); + ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); + ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag, + tag_mask, source); requests_in_flight_.insert(std::make_pair(*request, ucp_req)); } - void waitall(int count, request_t array_of_requests[]) const - { - ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); + void waitall(int count, request_t array_of_requests[]) const { + ASSERT(ucp_worker_ != nullptr, + "ERROR: UCX comms not initialized on communicator."); - std::vector requests; + std::vector requests; requests.reserve(count); time_t start = time(NULL); @@ -252,8 +245,7 @@ class std_comms : public comms_iface { for (int i = 0; i < count; ++i) { auto req_it = requests_in_flight_.find(array_of_requests[i]); ASSERT(requests_in_flight_.end() != req_it, - "ERROR: waitall on invalid request: %d", - array_of_requests[i]); + "ERROR: waitall on invalid request: %d", array_of_requests[i]); requests.push_back(req_it->second); free_requests_.insert(req_it->first); requests_in_flight_.erase(req_it); @@ -266,7 +258,8 @@ class std_comms : public comms_iface { // in 10 or more seconds. ASSERT(now - start < 10, "Timed out waiting for requests."); - for (std::vector::iterator it = requests.begin(); it != requests.end();) { + for (std::vector::iterator it = requests.begin(); + it != requests.end();) { bool restart = false; // resets the timeout when any progress was made // Causes UCP to progress through the send/recv message queue @@ -279,8 +272,10 @@ class std_comms : public comms_iface { // If the message needs release, we know it will be sent/received // asynchronously, so we will need to track and verify its state if (req->needs_release) { - ASSERT(UCS_PTR_IS_PTR(req->req), "UCX Request Error. Request is not valid UCX pointer"); - ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", UCS_PTR_STATUS(req->req)); + ASSERT(UCS_PTR_IS_PTR(req->req), + "UCX Request Error. Request is not valid UCX pointer"); + ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", + UCS_PTR_STATUS(req->req)); ASSERT(req->req->completed == 1 || req->req->completed == 0, "request->completed not a valid value: %d\n", req->req->completed); @@ -301,143 +296,94 @@ class std_comms : public comms_iface { ++it; } // if any progress was made, reset the timeout start time - if (restart) { start = time(NULL); } + if (restart) { + start = time(NULL); + } } } } - void allreduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - op_t op, - cudaStream_t stream) const - { - NCCL_TRY(ncclAllReduce( - sendbuff, recvbuff, count, get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream)); + void allreduce(const void *sendbuff, void *recvbuff, size_t count, + datatype_t datatype, op_t op, cudaStream_t stream) const { + NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count, + get_nccl_datatype(datatype), get_nccl_op(op), + nccl_comm_, stream)); } - void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const - { - NCCL_TRY( - ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); + void bcast(void *buff, size_t count, datatype_t datatype, int root, + cudaStream_t stream) const { + NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, + nccl_comm_, stream)); } - void reduce(const void* sendbuff, - void* recvbuff, - size_t count, - datatype_t datatype, - op_t op, - int root, - cudaStream_t stream) const - { - NCCL_TRY(ncclReduce(sendbuff, - recvbuff, - count, - get_nccl_datatype(datatype), - get_nccl_op(op), - root, - nccl_comm_, - stream)); + void reduce(const void *sendbuff, void *recvbuff, size_t count, + datatype_t datatype, op_t op, int root, + cudaStream_t stream) const { + NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype), + get_nccl_op(op), root, nccl_comm_, stream)); } - void allgather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - cudaStream_t stream) const - { - NCCL_TRY(ncclAllGather( - sendbuff, recvbuff, sendcount, get_nccl_datatype(datatype), nccl_comm_, stream)); + void allgather(const void *sendbuff, void *recvbuff, size_t sendcount, + datatype_t datatype, cudaStream_t stream) const { + NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount, + get_nccl_datatype(datatype), nccl_comm_, stream)); } - void allgatherv(const void* sendbuf, - void* recvbuf, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - cudaStream_t stream) const - { - // From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - - // https://arxiv.org/pdf/1812.05964.pdf Listing 1 on page 4. + void allgatherv(const void *sendbuf, void *recvbuf, const size_t *recvcounts, + const size_t *displs, datatype_t datatype, + cudaStream_t stream) const { + //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf + //Listing 1 on page 4. for (int root = 0; root < num_ranks_; ++root) { size_t dtype_size = get_datatype_size(datatype); - NCCL_TRY(ncclBroadcast(sendbuf, - static_cast(recvbuf) + displs[root] * dtype_size, - recvcounts[root], - get_nccl_datatype(datatype), - root, - nccl_comm_, - stream)); + NCCL_TRY(ncclBroadcast( + sendbuf, static_cast(recvbuf) + displs[root] * dtype_size, + recvcounts[root], get_nccl_datatype(datatype), root, nccl_comm_, + stream)); } } - void gather(const void* sendbuff, - void* recvbuff, - size_t sendcount, - datatype_t datatype, - int root, - cudaStream_t stream) const - { + void gather(const void *sendbuff, void *recvbuff, size_t sendcount, + datatype_t datatype, int root, cudaStream_t stream) const { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv(static_cast(recvbuff) + sendcount * r * dtype_size, - sendcount, - get_nccl_datatype(datatype), - r, - nccl_comm_, - stream)); + NCCL_TRY(ncclRecv( + static_cast(recvbuff) + sendcount * r * dtype_size, sendcount, + get_nccl_datatype(datatype), r, nccl_comm_, stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, + nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void gatherv(const void* sendbuff, - void* recvbuff, - size_t sendcount, - const size_t* recvcounts, - const size_t* displs, - datatype_t datatype, - int root, - cudaStream_t stream) const - { + void gatherv(const void *sendbuff, void *recvbuff, size_t sendcount, + const size_t *recvcounts, const size_t *displs, + datatype_t datatype, int root, cudaStream_t stream) const { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv(static_cast(recvbuff) + displs[r] * dtype_size, - recvcounts[r], - get_nccl_datatype(datatype), - r, - nccl_comm_, - stream)); + NCCL_TRY(ncclRecv( + static_cast(recvbuff) + displs[r] * dtype_size, recvcounts[r], + get_nccl_datatype(datatype), r, nccl_comm_, stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, + nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void reducescatter(const void* sendbuff, - void* recvbuff, - size_t recvcount, - datatype_t datatype, - op_t op, - cudaStream_t stream) const - { - NCCL_TRY(ncclReduceScatter(sendbuff, - recvbuff, - recvcount, - get_nccl_datatype(datatype), - get_nccl_op(op), - nccl_comm_, - stream)); + void reducescatter(const void *sendbuff, void *recvbuff, size_t recvcount, + datatype_t datatype, op_t op, cudaStream_t stream) const { + NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount, + get_nccl_datatype(datatype), get_nccl_op(op), + nccl_comm_, stream)); } - status_t sync_stream(cudaStream_t stream) const - { + status_t sync_stream(cudaStream_t stream) const { cudaError_t cudaErr; ncclResult_t ncclErr, ncclAsyncErr; while (1) { @@ -470,58 +416,45 @@ class std_comms : public comms_iface { } // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const - { + void device_send(const void *buf, size_t size, int dest, + cudaStream_t stream) const { NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream)); } // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const - { + void device_recv(void *buf, size_t size, int source, + cudaStream_t stream) const { NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream)); } - void device_sendrecv(const void* sendbuf, - size_t sendsize, - int dest, - void* recvbuf, - size_t recvsize, - int source, - cudaStream_t stream) const - { + void device_sendrecv(const void *sendbuf, size_t sendsize, int dest, + void *recvbuf, size_t recvsize, int source, + cudaStream_t stream) const { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream)); - NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); + NCCL_TRY( + ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void device_multicast_sendrecv(const void* sendbuf, - std::vector const& sendsizes, - std::vector const& sendoffsets, - std::vector const& dests, - void* recvbuf, - std::vector const& recvsizes, - std::vector const& recvoffsets, - std::vector const& sources, - cudaStream_t stream) const - { + void device_multicast_sendrecv(const void *sendbuf, + std::vector const &sendsizes, + std::vector const &sendoffsets, + std::vector const &dests, void *recvbuf, + std::vector const &recvsizes, + std::vector const &recvoffsets, + std::vector const &sources, + cudaStream_t stream) const { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); for (size_t i = 0; i < sendsizes.size(); ++i) { - NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], - sendsizes[i], - ncclUint8, - dests[i], - nccl_comm_, - stream)); + NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], + sendsizes[i], ncclUint8, dests[i], nccl_comm_, stream)); } for (size_t i = 0; i < recvsizes.size(); ++i) { - NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], - recvsizes[i], - ncclUint8, - sources[i], - nccl_comm_, + NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], + recvsizes[i], ncclUint8, sources[i], nccl_comm_, stream)); } NCCL_TRY(ncclGroupEnd()); @@ -540,9 +473,10 @@ class std_comms : public comms_iface { comms_ucp_handler ucp_handler_; ucp_worker_h ucp_worker_; - std::shared_ptr ucp_eps_; + std::shared_ptr ucp_eps_; mutable request_t next_request_id_; - mutable std::unordered_map requests_in_flight_; + mutable std::unordered_map + requests_in_flight_; mutable std::unordered_set free_requests_; std::shared_ptr device_allocator_; diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp index 86827a294e..4e95c4eef0 100644 --- a/cpp/include/raft/comms/test.hpp +++ b/cpp/include/raft/comms/test.hpp @@ -37,9 +37,8 @@ namespace comms { * @param the raft handle to use. This is expected to already have an * initialized comms instance. */ -bool test_collective_allreduce(const handle_t& handle, int root) -{ - comms_t const& communicator = handle.get_comms(); +bool test_collective_allreduce(const handle_t &handle, int root) { + comms_t const &communicator = handle.get_comms(); int const send = 1; @@ -47,12 +46,14 @@ bool test_collective_allreduce(const handle_t& handle, int root) raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); temp_d.resize(1, stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); + CUDA_CHECK( + cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); communicator.allreduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, stream); int temp_h = 0; - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK( + cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -68,9 +69,8 @@ bool test_collective_allreduce(const handle_t& handle, int root) * @param the raft handle to use. This is expected to already have an * initialized comms instance. */ -bool test_collective_broadcast(const handle_t& handle, int root) -{ - comms_t const& communicator = handle.get_comms(); +bool test_collective_broadcast(const handle_t &handle, int root) { + comms_t const &communicator = handle.get_comms(); int const send = root; @@ -80,12 +80,14 @@ bool test_collective_broadcast(const handle_t& handle, int root) temp_d.resize(1, stream); if (communicator.get_rank() == root) - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), + cudaMemcpyHostToDevice, stream)); communicator.bcast(temp_d.data(), 1, root, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), + cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -95,9 +97,8 @@ bool test_collective_broadcast(const handle_t& handle, int root) return temp_h == root; } -bool test_collective_reduce(const handle_t& handle, int root) -{ - comms_t const& communicator = handle.get_comms(); +bool test_collective_reduce(const handle_t &handle, int root) { + comms_t const &communicator = handle.get_comms(); int const send = root; @@ -106,12 +107,14 @@ bool test_collective_reduce(const handle_t& handle, int root) raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); temp_d.resize(1, stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), + cudaMemcpyHostToDevice, stream)); communicator.reduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, root, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), + cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -124,9 +127,8 @@ bool test_collective_reduce(const handle_t& handle, int root) return true; } -bool test_collective_allgather(const handle_t& handle, int root) -{ - comms_t const& communicator = handle.get_comms(); +bool test_collective_allgather(const handle_t &handle, int root) { + comms_t const &communicator = handle.get_comms(); int const send = communicator.get_rank(); @@ -135,16 +137,19 @@ bool test_collective_allgather(const handle_t& handle, int root) raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); temp_d.resize(1, stream); - raft::mr::device::buffer recv_d( - handle.get_device_allocator(), stream, communicator.get_size()); + raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, + communicator.get_size()); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), + cudaMemcpyHostToDevice, stream)); communicator.allgather(temp_d.data(), recv_d.data(), 1, stream); communicator.sync_stream(stream); - int temp_h[communicator.get_size()]; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync( - &temp_h, recv_d.data(), sizeof(int) * communicator.get_size(), cudaMemcpyDeviceToHost, stream)); + int + temp_h[communicator.get_size()]; // Verify more than one byte is being sent + CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), + sizeof(int) * communicator.get_size(), + cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -157,9 +162,8 @@ bool test_collective_allgather(const handle_t& handle, int root) return true; } -bool test_collective_gather(const handle_t& handle, int root) -{ - comms_t const& communicator = handle.get_comms(); +bool test_collective_gather(const handle_t &handle, int root) { + comms_t const &communicator = handle.get_comms(); int const send = communicator.get_rank(); @@ -169,19 +173,20 @@ bool test_collective_gather(const handle_t& handle, int root) temp_d.resize(1, stream); raft::mr::device::buffer recv_d( - handle.get_device_allocator(), - stream, + handle.get_device_allocator(), stream, communicator.get_rank() == root ? communicator.get_size() : 0); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), + cudaMemcpyHostToDevice, stream)); communicator.gather(temp_d.data(), recv_d.data(), 1, root, stream); communicator.sync_stream(stream); if (communicator.get_rank() == root) { std::vector temp_h(communicator.get_size(), 0); - CUDA_CHECK(cudaMemcpyAsync( - temp_h.data(), recv_d.data(), sizeof(int) * temp_h.size(), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(), + sizeof(int) * temp_h.size(), + cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); for (int i = 0; i < communicator.get_size(); i++) { @@ -191,48 +196,46 @@ bool test_collective_gather(const handle_t& handle, int root) return true; } -bool test_collective_gatherv(const handle_t& handle, int root) -{ - comms_t const& communicator = handle.get_comms(); +bool test_collective_gatherv(const handle_t &handle, int root) { + comms_t const &communicator = handle.get_comms(); std::vector sendcounts(communicator.get_size()); std::iota(sendcounts.begin(), sendcounts.end(), size_t{1}); std::vector displacements(communicator.get_size() + 1, 0); - std::partial_sum(sendcounts.begin(), sendcounts.end(), displacements.begin() + 1); + std::partial_sum(sendcounts.begin(), sendcounts.end(), + displacements.begin() + 1); - std::vector sends( - displacements[communicator.get_rank() + 1] - displacements[communicator.get_rank()], - communicator.get_rank()); + std::vector sends(displacements[communicator.get_rank() + 1] - + displacements[communicator.get_rank()], + communicator.get_rank()); cudaStream_t stream = handle.get_stream(); raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); temp_d.resize(sends.size(), stream); - raft::mr::device::buffer recv_d(handle.get_device_allocator(), - stream, - communicator.get_rank() == root ? displacements.back() : 0); + raft::mr::device::buffer recv_d( + handle.get_device_allocator(), stream, + communicator.get_rank() == root ? displacements.back() : 0); - CUDA_CHECK(cudaMemcpyAsync( - temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), + sends.size() * sizeof(int), cudaMemcpyHostToDevice, + stream)); communicator.gatherv( - temp_d.data(), - recv_d.data(), - temp_d.size(), - communicator.get_rank() == root ? sendcounts.data() : static_cast(nullptr), - communicator.get_rank() == root ? displacements.data() : static_cast(nullptr), - root, - stream); + temp_d.data(), recv_d.data(), temp_d.size(), + communicator.get_rank() == root ? sendcounts.data() + : static_cast(nullptr), + communicator.get_rank() == root ? displacements.data() + : static_cast(nullptr), + root, stream); communicator.sync_stream(stream); if (communicator.get_rank() == root) { std::vector temp_h(displacements.back(), 0); - CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), - recv_d.data(), + CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(), sizeof(int) * displacements.back(), - cudaMemcpyDeviceToHost, - stream)); + cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); for (int i = 0; i < communicator.get_size(); i++) { @@ -246,24 +249,28 @@ bool test_collective_gatherv(const handle_t& handle, int root) return true; } -bool test_collective_reducescatter(const handle_t& handle, int root) -{ - comms_t const& communicator = handle.get_comms(); +bool test_collective_reducescatter(const handle_t &handle, int root) { + comms_t const &communicator = handle.get_comms(); std::vector sends(communicator.get_size(), 1); cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream, sends.size()); - raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, 1); + raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream, + sends.size()); + raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, + 1); - CUDA_CHECK(cudaMemcpyAsync( - temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), + sends.size() * sizeof(int), cudaMemcpyHostToDevice, + stream)); - communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, stream); + communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, + stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int), + cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -280,10 +287,9 @@ bool test_collective_reducescatter(const handle_t& handle, int root) * initialized comms instance. * @param number of iterations of all-to-all messaging to perform */ -bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials) -{ - comms_t const& communicator = h.get_comms(); - int const rank = communicator.get_rank(); +bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { + comms_t const &communicator = h.get_comms(); + int const rank = communicator.get_rank(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -292,11 +298,11 @@ bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials) std::vector requests; requests.resize(2 * (communicator.get_size() - 1)); int request_idx = 0; - // post receives + //post receives for (int r = 0; r < communicator.get_size(); ++r) { if (r != rank) { - communicator.irecv( - received_data.data() + request_idx, 1, r, 0, requests.data() + request_idx); + communicator.irecv(received_data.data() + request_idx, 1, r, 0, + requests.data() + request_idx); ++request_idx; } } @@ -332,7 +338,8 @@ bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials) communicator.barrier(); } - if (communicator.get_rank() == 0) std::cout << "=========================" << std::endl; + if (communicator.get_rank() == 0) + std::cout << "=========================" << std::endl; } return ret; @@ -345,11 +352,10 @@ bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials) * initialized comms instance. * @param number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials) -{ - comms_t const& communicator = h.get_comms(); - int const rank = communicator.get_rank(); - cudaStream_t stream = h.get_stream(); +bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) { + comms_t const &communicator = h.get_comms(); + int const rank = communicator.get_rank(); + cudaStream_t stream = h.get_stream(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -372,9 +378,13 @@ bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials) communicator.sync_stream(stream); - if (!sender && received_data.value(stream) != rank - 1) { ret = false; } + if (!sender && received_data.value(stream) != rank - 1) { + ret = false; + } - if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; } + if (communicator.get_rank() == 0) { + std::cout << "=========================" << std::endl; + } } return ret; @@ -387,11 +397,10 @@ bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials) * initialized comms instance. * @param number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials) -{ - comms_t const& communicator = h.get_comms(); - int const rank = communicator.get_rank(); - cudaStream_t stream = h.get_stream(); +bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) { + comms_t const &communicator = h.get_comms(); + int const rank = communicator.get_rank(); + cudaStream_t stream = h.get_stream(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -405,12 +414,12 @@ bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials) if (rank % 2 == 0) { if (rank + 1 < communicator.get_size()) { - communicator.device_sendrecv( - sent_data.data(), 1, rank + 1, received_data.data(), 1, rank + 1, stream); + communicator.device_sendrecv(sent_data.data(), 1, rank + 1, + received_data.data(), 1, rank + 1, stream); } } else { - communicator.device_sendrecv( - sent_data.data(), 1, rank - 1, received_data.data(), 1, rank - 1, stream); + communicator.device_sendrecv(sent_data.data(), 1, rank - 1, + received_data.data(), 1, rank - 1, stream); } communicator.sync_stream(stream); @@ -420,7 +429,9 @@ bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials) ret = false; } - if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; } + if (communicator.get_rank() == 0) { + std::cout << "=========================" << std::endl; + } } return ret; @@ -433,11 +444,11 @@ bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials) * initialized comms instance. * @param number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrials) -{ - comms_t const& communicator = h.get_comms(); - int const rank = communicator.get_rank(); - cudaStream_t stream = h.get_stream(); +bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h, + int numTrials) { + comms_t const &communicator = h.get_comms(); + int const rank = communicator.get_rank(); + cudaStream_t stream = h.get_stream(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -460,26 +471,25 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrial std::vector srcs(communicator.get_size()); std::iota(srcs.begin(), srcs.end(), int{0}); - communicator.device_multicast_sendrecv(sent_data.data(), - sendsizes, - sendoffsets, - dests, - received_data.data(), - recvsizes, - recvoffsets, - srcs, - stream); + communicator.device_multicast_sendrecv( + sent_data.data(), sendsizes, sendoffsets, dests, received_data.data(), + recvsizes, recvoffsets, srcs, stream); communicator.sync_stream(stream); std::vector h_received_data(communicator.get_size()); - raft::update_host(h_received_data.data(), received_data.data(), received_data.size(), stream); + raft::update_host(h_received_data.data(), received_data.data(), + received_data.size(), stream); CUDA_TRY(cudaStreamSynchronize(stream)); for (int i = 0; i < communicator.get_size(); ++i) { - if (h_received_data[i] != i) { ret = false; } + if (h_received_data[i] != i) { + ret = false; + } } - if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; } + if (communicator.get_rank() == 0) { + std::cout << "=========================" << std::endl; + } } return ret; @@ -492,20 +502,20 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrial * initialized comms instance. * @param n_colors number of different colors to test */ -bool test_commsplit(const handle_t& h, int n_colors) -{ - comms_t const& communicator = h.get_comms(); - int const rank = communicator.get_rank(); - int const size = communicator.get_size(); +bool test_commsplit(const handle_t &h, int n_colors) { + comms_t const &communicator = h.get_comms(); + int const rank = communicator.get_rank(); + int const size = communicator.get_size(); if (n_colors > size) n_colors = size; // first we need to assign to a color, then assign the rank within the color int color = rank % n_colors; - int key = rank / n_colors; + int key = rank / n_colors; handle_t new_handle(1); - auto shared_comm = std::make_shared(communicator.comm_split(color, key)); + auto shared_comm = + std::make_shared(communicator.comm_split(color, key)); new_handle.set_comms(shared_comm); return test_collective_allreduce(new_handle, 0); diff --git a/cpp/include/raft/comms/ucp_helper.hpp b/cpp/include/raft/comms/ucp_helper.hpp index 89c7b25630..226b6f0527 100644 --- a/cpp/include/raft/comms/ucp_helper.hpp +++ b/cpp/include/raft/comms/ucp_helper.hpp @@ -25,19 +25,16 @@ namespace raft { namespace comms { -typedef void (*dlsym_print_info)(ucp_ep_h, FILE*); -typedef void (*dlsym_rec_free)(void*); +typedef void (*dlsym_print_info)(ucp_ep_h, FILE *); +typedef void (*dlsym_rec_free)(void *); typedef int (*dlsym_worker_progress)(ucp_worker_h); -typedef ucs_status_ptr_t (*dlsym_send)( - ucp_ep_h, const void*, size_t, ucp_datatype_t, ucp_tag_t, ucp_send_callback_t); -typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h, - void*, - size_t count, - ucp_datatype_t datatype, - ucp_tag_t, - ucp_tag_t, - ucp_tag_recv_callback_t); +typedef ucs_status_ptr_t (*dlsym_send)(ucp_ep_h, const void *, size_t, + ucp_datatype_t, ucp_tag_t, + ucp_send_callback_t); +typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h, void *, size_t count, + ucp_datatype_t datatype, ucp_tag_t, + ucp_tag_t, ucp_tag_recv_callback_t); /** * Standard UCX request object that will be passed @@ -58,9 +55,9 @@ struct ucx_context { */ class ucp_request { public: - struct ucx_context* req; - bool needs_release = true; - int other_rank = -1; + struct ucx_context *req; + bool needs_release = true; + int other_rank = -1; bool is_send_request = false; }; @@ -70,19 +67,18 @@ static const ucp_tag_t default_tag_mask = -1; /** * @brief Asynchronous send callback sets request to completed */ -static void send_callback(void* request, ucs_status_t status) -{ - struct ucx_context* context = (struct ucx_context*)request; - context->completed = 1; +static void send_callback(void *request, ucs_status_t status) { + struct ucx_context *context = (struct ucx_context *)request; + context->completed = 1; } /** * @brief Asynchronous recv callback sets request to completed */ -static void recv_callback(void* request, ucs_status_t status, ucp_tag_recv_info_t* info) -{ - struct ucx_context* context = (struct ucx_context*)request; - context->completed = 1; +static void recv_callback(void *request, ucs_status_t status, + ucp_tag_recv_info_t *info) { + struct ucx_context *context = (struct ucx_context *)request; + context->completed = 1; } /** @@ -91,8 +87,7 @@ static void recv_callback(void* request, ucs_status_t status, ucp_tag_recv_info_ */ class comms_ucp_handler { public: - comms_ucp_handler() - { + comms_ucp_handler() { load_ucp_handle(); load_send_func(); load_recv_func(); @@ -104,7 +99,7 @@ class comms_ucp_handler { ~comms_ucp_handler() { dlclose(ucp_handle); } private: - void* ucp_handle; + void *ucp_handle; dlsym_print_info print_info_func; dlsym_rec_free req_free_func; @@ -112,8 +107,7 @@ class comms_ucp_handler { dlsym_send send_func; dlsym_recv recv_func; - void load_ucp_handle() - { + void load_ucp_handle() { ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NOLOAD | RTLD_NODELETE); if (!ucp_handle) { ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NODELETE); @@ -123,56 +117,51 @@ class comms_ucp_handler { dlerror(); } - void assert_dlerror() - { - char* error = dlerror(); + void assert_dlerror() { + char *error = dlerror(); ASSERT(error == NULL, "Error loading function symbol: %s\n", error); } - void load_send_func() - { + void load_send_func() { send_func = (dlsym_send)dlsym(ucp_handle, "ucp_tag_send_nb"); assert_dlerror(); } - void load_free_req_func() - { + void load_free_req_func() { req_free_func = (dlsym_rec_free)dlsym(ucp_handle, "ucp_request_free"); assert_dlerror(); } - void load_print_info_func() - { + void load_print_info_func() { print_info_func = (dlsym_print_info)dlsym(ucp_handle, "ucp_ep_print_info"); assert_dlerror(); } - void load_worker_progress_func() - { - worker_progress_func = (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress"); + void load_worker_progress_func() { + worker_progress_func = + (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress"); assert_dlerror(); } - void load_recv_func() - { + void load_recv_func() { recv_func = (dlsym_recv)dlsym(ucp_handle, "ucp_tag_recv_nb"); assert_dlerror(); } - ucp_tag_t build_message_tag(int rank, int tag) const - { + ucp_tag_t build_message_tag(int rank, int tag) const { // keeping the rank in the lower bits enables debugging. return ((uint32_t)tag << 31) | (uint32_t)rank; } public: - int ucp_progress(ucp_worker_h worker) const { return (*(worker_progress_func))(worker); } + int ucp_progress(ucp_worker_h worker) const { + return (*(worker_progress_func))(worker); + } /** * @brief Frees any memory underlying the given ucp request object */ - void free_ucp_request(ucp_request* request) const - { + void free_ucp_request(ucp_request *request) const { if (request->needs_release) { request->req->completed = 0; (*(req_free_func))(request->req); @@ -183,67 +172,56 @@ class comms_ucp_handler { /** * @brief Asynchronously send data to the given endpoint using the given tag */ - void ucp_isend(ucp_request* req, - ucp_ep_h ep_ptr, - const void* buf, - size_t size, - int tag, - ucp_tag_t tag_mask, - int rank) const - { + void ucp_isend(ucp_request *req, ucp_ep_h ep_ptr, const void *buf, + size_t size, int tag, ucp_tag_t tag_mask, int rank) const { ucp_tag_t ucp_tag = build_message_tag(rank, tag); - ucs_status_ptr_t send_result = - (*(send_func))(ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback); - struct ucx_context* ucp_req = (struct ucx_context*)send_result; + ucs_status_ptr_t send_result = (*(send_func))( + ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback); + struct ucx_context *ucp_req = (struct ucx_context *)send_result; if (UCS_PTR_IS_ERR(send_result)) { ASSERT(!UCS_PTR_IS_ERR(send_result), "unable to send UCX data message (%d)\n", UCS_PTR_STATUS(send_result)); /** - * If the request didn't fail, but it's not OK, it is in flight. - * Expect the handler to be invoked - */ + * If the request didn't fail, but it's not OK, it is in flight. + * Expect the handler to be invoked + */ } else if (UCS_PTR_STATUS(send_result) != UCS_OK) { /** - * If the request is OK, it's already been completed and we don't need to wait on it. - * The request will be a nullptr, however, so we need to create a new request - * and set it to completed to make the "waitall()" function work properly. - */ + * If the request is OK, it's already been completed and we don't need to wait on it. + * The request will be a nullptr, however, so we need to create a new request + * and set it to completed to make the "waitall()" function work properly. + */ req->needs_release = true; } else { req->needs_release = false; } - req->other_rank = rank; + req->other_rank = rank; req->is_send_request = true; - req->req = ucp_req; + req->req = ucp_req; } /** * @brief Asynchronously receive data from given endpoint with the given tag. */ - void ucp_irecv(ucp_request* req, - ucp_worker_h worker, - ucp_ep_h ep_ptr, - void* buf, - size_t size, - int tag, - ucp_tag_t tag_mask, - int sender_rank) const - { + void ucp_irecv(ucp_request *req, ucp_worker_h worker, ucp_ep_h ep_ptr, + void *buf, size_t size, int tag, ucp_tag_t tag_mask, + int sender_rank) const { ucp_tag_t ucp_tag = build_message_tag(sender_rank, tag); ucs_status_ptr_t recv_result = - (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, tag_mask, recv_callback); + (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, + tag_mask, recv_callback); - struct ucx_context* ucp_req = (struct ucx_context*)recv_result; + struct ucx_context *ucp_req = (struct ucx_context *)recv_result; - req->req = ucp_req; - req->needs_release = true; + req->req = ucp_req; + req->needs_release = true; req->is_send_request = false; - req->other_rank = sender_rank; + req->other_rank = sender_rank; ASSERT(!UCS_PTR_IS_ERR(recv_result), "unable to receive UCX data message (%d)\n", diff --git a/cpp/include/raft/comms/util.hpp b/cpp/include/raft/comms/util.hpp index 1b0548fc00..f3216abc37 100644 --- a/cpp/include/raft/comms/util.hpp +++ b/cpp/include/raft/comms/util.hpp @@ -26,70 +26,88 @@ * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an * exception detailing the NCCL error that occurred */ -#define NCCL_TRY(call) \ - do { \ - ncclResult_t const status = (call); \ - if (ncclSuccess != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "NCCL error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ - ncclGetErrorString(status)); \ - throw raft::logic_error(msg); \ - } \ +#define NCCL_TRY(call) \ + do { \ + ncclResult_t const status = (call); \ + if (ncclSuccess != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "NCCL error encountered at: ", "call='%s', Reason=%d:%s", \ + #call, status, ncclGetErrorString(status)); \ + throw raft::logic_error(msg); \ + } \ } while (0); -#define NCCL_TRY_NO_THROW(call) \ - do { \ - ncclResult_t status = call; \ - if (ncclSuccess != status) { \ - printf("NCCL call='%s' failed. Reason:%s\n", #call, ncclGetErrorString(status)); \ - } \ +#define NCCL_TRY_NO_THROW(call) \ + do { \ + ncclResult_t status = call; \ + if (ncclSuccess != status) { \ + printf("NCCL call='%s' failed. Reason:%s\n", #call, \ + ncclGetErrorString(status)); \ + } \ } while (0) namespace raft { namespace comms { -constexpr size_t get_datatype_size(const datatype_t datatype) -{ +constexpr size_t get_datatype_size(const datatype_t datatype) { switch (datatype) { - case datatype_t::CHAR: return sizeof(char); - case datatype_t::UINT8: return sizeof(uint8_t); - case datatype_t::INT32: return sizeof(int); - case datatype_t::UINT32: return sizeof(unsigned int); - case datatype_t::INT64: return sizeof(int64_t); - case datatype_t::UINT64: return sizeof(uint64_t); - case datatype_t::FLOAT32: return sizeof(float); - case datatype_t::FLOAT64: return sizeof(double); - default: throw "Unsupported datatype"; + case datatype_t::CHAR: + return sizeof(char); + case datatype_t::UINT8: + return sizeof(uint8_t); + case datatype_t::INT32: + return sizeof(int); + case datatype_t::UINT32: + return sizeof(unsigned int); + case datatype_t::INT64: + return sizeof(int64_t); + case datatype_t::UINT64: + return sizeof(uint64_t); + case datatype_t::FLOAT32: + return sizeof(float); + case datatype_t::FLOAT64: + return sizeof(double); + default: + throw "Unsupported datatype"; } } -constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype) -{ +constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype) { switch (datatype) { - case datatype_t::CHAR: return ncclChar; - case datatype_t::UINT8: return ncclUint8; - case datatype_t::INT32: return ncclInt; - case datatype_t::UINT32: return ncclUint32; - case datatype_t::INT64: return ncclInt64; - case datatype_t::UINT64: return ncclUint64; - case datatype_t::FLOAT32: return ncclFloat; - case datatype_t::FLOAT64: return ncclDouble; - default: throw "Unsupported datatype"; + case datatype_t::CHAR: + return ncclChar; + case datatype_t::UINT8: + return ncclUint8; + case datatype_t::INT32: + return ncclInt; + case datatype_t::UINT32: + return ncclUint32; + case datatype_t::INT64: + return ncclInt64; + case datatype_t::UINT64: + return ncclUint64; + case datatype_t::FLOAT32: + return ncclFloat; + case datatype_t::FLOAT64: + return ncclDouble; + default: + throw "Unsupported datatype"; } } -constexpr ncclRedOp_t get_nccl_op(const op_t op) -{ +constexpr ncclRedOp_t get_nccl_op(const op_t op) { switch (op) { - case op_t::SUM: return ncclSum; - case op_t::PROD: return ncclProd; - case op_t::MIN: return ncclMin; - case op_t::MAX: return ncclMax; - default: throw "Unsupported datatype"; + case op_t::SUM: + return ncclSum; + case op_t::PROD: + return ncclProd; + case op_t::MIN: + return ncclMin; + case op_t::MAX: + return ncclMax; + default: + throw "Unsupported datatype"; } } }; // namespace comms diff --git a/cpp/include/raft/cuda_utils.cuh b/cpp/include/raft/cuda_utils.cuh index 8a66eff242..14274043f5 100644 --- a/cpp/include/raft/cuda_utils.cuh +++ b/cpp/include/raft/cuda_utils.cuh @@ -36,17 +36,16 @@ namespace raft { /** helper macro for device inlined functions */ -#define DI inline __device__ +#define DI inline __device__ #define HDI inline __host__ __device__ -#define HD __host__ __device__ +#define HD __host__ __device__ /** * @brief Provide a ceiling division operation ie. ceil(a / b) * @tparam IntType supposed to be only integers for now! */ template -constexpr HDI IntType ceildiv(IntType a, IntType b) -{ +constexpr HDI IntType ceildiv(IntType a, IntType b) { return (a + b - 1) / b; } @@ -55,8 +54,7 @@ constexpr HDI IntType ceildiv(IntType a, IntType b) * @tparam IntType supposed to be only integers for now! */ template -constexpr HDI IntType alignTo(IntType a, IntType b) -{ +constexpr HDI IntType alignTo(IntType a, IntType b) { return ceildiv(a, b) * b; } @@ -65,8 +63,7 @@ constexpr HDI IntType alignTo(IntType a, IntType b) * @tparam IntType supposed to be only integers for now! */ template -constexpr HDI IntType alignDown(IntType a, IntType b) -{ +constexpr HDI IntType alignDown(IntType a, IntType b) { return (a / b) * b; } @@ -75,8 +72,7 @@ constexpr HDI IntType alignDown(IntType a, IntType b) * @tparam IntType data type (checked only for integers) */ template -constexpr HDI bool isPo2(IntType num) -{ +constexpr HDI bool isPo2(IntType num) { return (num && !(num & (num - 1))); } @@ -85,16 +81,14 @@ constexpr HDI bool isPo2(IntType num) * @tparam IntType data type (checked only for integers) */ template -constexpr HDI IntType log2(IntType num, IntType ret = IntType(0)) -{ +constexpr HDI IntType log2(IntType num, IntType ret = IntType(0)) { return num <= IntType(1) ? ret : log2(num >> IntType(1), ++ret); } /** Device function to apply the input lambda across threads in the grid */ template -DI void forEach(int num, L lambda) -{ - int idx = (blockDim.x * blockIdx.x) + threadIdx.x; +DI void forEach(int num, L lambda) { + int idx = (blockDim.x * blockIdx.x) + threadIdx.x; const int numThreads = blockDim.x * gridDim.x; #pragma unroll for (int itr = 0; itr < ItemsPerThread; ++itr, idx += numThreads) { @@ -106,8 +100,7 @@ DI void forEach(int num, L lambda) static const int WarpSize = 32; /** get the laneId of the current thread */ -DI int laneId() -{ +DI int laneId() { int id; asm("mov.s32 %0, %laneid;" : "=r"(id)); return id; @@ -120,17 +113,15 @@ DI int laneId() * @param b second input */ template -HDI void swapVals(T& a, T& b) -{ +HDI void swapVals(T &a, T &b) { T tmp = a; - a = b; - b = tmp; + a = b; + b = tmp; } /** Device function to have atomic add support for older archs */ template -DI void myAtomicAdd(Type* address, Type val) -{ +DI void myAtomicAdd(Type *address, Type val) { atomicAdd(address, val); } @@ -138,114 +129,105 @@ DI void myAtomicAdd(Type* address, Type val) // Ref: // http://on-demand.gputechconf.com/gtc/2013/presentations/S3101-Atomic-Memory-Operations.pdf template <> -DI void myAtomicAdd(double* address, double val) -{ - unsigned long long int* address_as_ull = (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; +DI void myAtomicAdd(double *address, double val) { + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; do { assumed = old; - old = - atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); } while (assumed != old); } #endif template -DI void myAtomicReduce(T* address, T val, ReduceLambda op); +DI void myAtomicReduce(T *address, T val, ReduceLambda op); template -DI void myAtomicReduce(double* address, double val, ReduceLambda op) -{ - unsigned long long int* address_as_ull = (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; +DI void myAtomicReduce(double *address, double val, ReduceLambda op) { + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; do { assumed = old; - old = atomicCAS( - address_as_ull, assumed, __double_as_longlong(op(val, __longlong_as_double(assumed)))); + old = + atomicCAS(address_as_ull, assumed, + __double_as_longlong(op(val, __longlong_as_double(assumed)))); } while (assumed != old); } template -DI void myAtomicReduce(float* address, float val, ReduceLambda op) -{ - unsigned int* address_as_uint = (unsigned int*)address; - unsigned int old = *address_as_uint, assumed; +DI void myAtomicReduce(float *address, float val, ReduceLambda op) { + unsigned int *address_as_uint = (unsigned int *)address; + unsigned int old = *address_as_uint, assumed; do { assumed = old; - old = atomicCAS(address_as_uint, assumed, __float_as_uint(op(val, __uint_as_float(assumed)))); + old = atomicCAS(address_as_uint, assumed, + __float_as_uint(op(val, __uint_as_float(assumed)))); } while (assumed != old); } template -DI void myAtomicReduce(int* address, int val, ReduceLambda op) -{ +DI void myAtomicReduce(int *address, int val, ReduceLambda op) { int old = *address, assumed; do { assumed = old; - old = atomicCAS(address, assumed, op(val, assumed)); + old = atomicCAS(address, assumed, op(val, assumed)); } while (assumed != old); } template -DI void myAtomicReduce(long long* address, long long val, ReduceLambda op) -{ +DI void myAtomicReduce(long long *address, long long val, ReduceLambda op) { long long old = *address, assumed; do { assumed = old; - old = atomicCAS(address, assumed, op(val, assumed)); + old = atomicCAS(address, assumed, op(val, assumed)); } while (assumed != old); } template -DI void myAtomicReduce(unsigned long long* address, unsigned long long val, ReduceLambda op) -{ +DI void myAtomicReduce(unsigned long long *address, unsigned long long val, + ReduceLambda op) { unsigned long long old = *address, assumed; do { assumed = old; - old = atomicCAS(address, assumed, op(val, assumed)); + old = atomicCAS(address, assumed, op(val, assumed)); } while (assumed != old); } /** * @brief Provide atomic min operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ min(old value, - * val) + * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMin(T* address, T val); +DI T myAtomicMin(T *address, T val); /** * @brief Provide atomic max operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ max(old value, - * val) + * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMax(T* address, T val); +DI T myAtomicMax(T *address, T val); -DI float myAtomicMin(float* address, float val) -{ +DI float myAtomicMin(float *address, float val) { myAtomicReduce(address, val, fminf); return *address; } -DI float myAtomicMax(float* address, float val) -{ +DI float myAtomicMax(float *address, float val) { myAtomicReduce(address, val, fmaxf); return *address; } -DI double myAtomicMin(double* address, double val) -{ +DI double myAtomicMin(double *address, double val) { myAtomicReduce(address, val, fmin); return *address; } -DI double myAtomicMax(double* address, double val) -{ +DI double myAtomicMax(double *address, double val) { myAtomicReduce(address, val, fmax); return *address; } @@ -257,13 +239,11 @@ DI double myAtomicMax(double* address, double val) template HDI T myMax(T x, T y); template <> -HDI float myMax(float x, float y) -{ +HDI float myMax(float x, float y) { return fmaxf(x, y); } template <> -HDI double myMax(double x, double y) -{ +HDI double myMax(double x, double y) { return fmax(x, y); } /** @} */ @@ -275,13 +255,11 @@ HDI double myMax(double x, double y) template HDI T myMin(T x, T y); template <> -HDI float myMin(float x, float y) -{ +HDI float myMin(float x, float y) { return fminf(x, y); } template <> -HDI double myMin(double x, double y) -{ +HDI double myMin(double x, double y) { return fmin(x, y); } /** @} */ @@ -289,13 +267,11 @@ HDI double myMin(double x, double y) /** * @brief Provide atomic min operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ min(old value, - * val) + * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMin(T* address, T val) -{ +DI T myAtomicMin(T *address, T val) { myAtomicReduce(address, val, myMin); return *address; } @@ -303,13 +279,11 @@ DI T myAtomicMin(T* address, T val) /** * @brief Provide atomic max operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ max(old value, - * val) + * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMax(T* address, T val) -{ +DI T myAtomicMax(T *address, T val) { myAtomicReduce(address, val, myMax); return *address; } @@ -318,8 +292,7 @@ DI T myAtomicMax(T* address, T val) * Sign function */ template -HDI int sgn(const T val) -{ +HDI int sgn(const T val) { return (T(0) < val) - (val < T(0)); } @@ -330,13 +303,11 @@ HDI int sgn(const T val) template HDI T myExp(T x); template <> -HDI float myExp(float x) -{ +HDI float myExp(float x) { return expf(x); } template <> -HDI double myExp(double x) -{ +HDI double myExp(double x) { return exp(x); } /** @} */ @@ -348,13 +319,11 @@ HDI double myExp(double x) template inline __device__ T myInf(); template <> -inline __device__ float myInf() -{ +inline __device__ float myInf() { return CUDART_INF_F; } template <> -inline __device__ double myInf() -{ +inline __device__ double myInf() { return CUDART_INF; } /** @} */ @@ -366,13 +335,11 @@ inline __device__ double myInf() template HDI T myLog(T x); template <> -HDI float myLog(float x) -{ +HDI float myLog(float x) { return logf(x); } template <> -HDI double myLog(double x) -{ +HDI double myLog(double x) { return log(x); } /** @} */ @@ -384,13 +351,11 @@ HDI double myLog(double x) template HDI T mySqrt(T x); template <> -HDI float mySqrt(float x) -{ +HDI float mySqrt(float x) { return sqrtf(x); } template <> -HDI double mySqrt(double x) -{ +HDI double mySqrt(double x) { return sqrt(x); } /** @} */ @@ -400,15 +365,13 @@ HDI double mySqrt(double x) * @{ */ template -DI void mySinCos(T x, T& s, T& c); +DI void mySinCos(T x, T &s, T &c); template <> -DI void mySinCos(float x, float& s, float& c) -{ +DI void mySinCos(float x, float &s, float &c) { sincosf(x, &s, &c); } template <> -DI void mySinCos(double x, double& s, double& c) -{ +DI void mySinCos(double x, double &s, double &c) { sincos(x, &s, &c); } /** @} */ @@ -420,13 +383,11 @@ DI void mySinCos(double x, double& s, double& c) template DI T mySin(T x); template <> -DI float mySin(float x) -{ +DI float mySin(float x) { return sinf(x); } template <> -DI double mySin(double x) -{ +DI double mySin(double x) { return sin(x); } /** @} */ @@ -436,18 +397,15 @@ DI double mySin(double x) * @{ */ template -DI T myAbs(T x) -{ +DI T myAbs(T x) { return x < 0 ? -x : x; } template <> -DI float myAbs(float x) -{ +DI float myAbs(float x) { return fabsf(x); } template <> -DI double myAbs(double x) -{ +DI double myAbs(double x) { return fabs(x); } /** @} */ @@ -459,13 +417,11 @@ DI double myAbs(double x) template HDI T myPow(T x, T power); template <> -HDI float myPow(float x, float power) -{ +HDI float myPow(float x, float power) { return powf(x, power); } template <> -HDI double myPow(double x, double power) -{ +HDI double myPow(double x, double power) { return pow(x, power); } /** @} */ @@ -477,13 +433,11 @@ HDI double myPow(double x, double power) template HDI T myTanh(T x); template <> -HDI float myTanh(float x) -{ +HDI float myTanh(float x) { return tanhf(x); } template <> -HDI double myTanh(double x) -{ +HDI double myTanh(double x) { return tanh(x); } /** @} */ @@ -495,13 +449,11 @@ HDI double myTanh(double x) template HDI T myATanh(T x); template <> -HDI float myATanh(float x) -{ +HDI float myATanh(float x) { return atanhf(x); } template <> -HDI double myATanh(double x) -{ +HDI double myATanh(double x) { return atanh(x); } /** @} */ @@ -540,18 +492,15 @@ struct Sum { * @{ */ template -DI T signPrim(T x) -{ +DI T signPrim(T x) { return x < 0 ? -1 : +1; } template <> -DI float signPrim(float x) -{ +DI float signPrim(float x) { return signbit(x) == true ? -1.0f : +1.0f; } template <> -DI double signPrim(double x) -{ +DI double signPrim(double x) { return signbit(x) == true ? -1.0 : +1.0; } /** @} */ @@ -565,33 +514,28 @@ DI double signPrim(double x) * @{ */ template -DI T maxPrim(T x, T y) -{ +DI T maxPrim(T x, T y) { return x > y ? x : y; } template <> -DI float maxPrim(float x, float y) -{ +DI float maxPrim(float x, float y) { return fmaxf(x, y); } template <> -DI double maxPrim(double x, double y) -{ +DI double maxPrim(double x, double y) { return fmax(x, y); } /** @} */ /** apply a warp-wide fence (useful from Volta+ archs) */ -DI void warpFence() -{ +DI void warpFence() { #if __CUDA_ARCH__ >= 700 __syncwarp(); #endif } /** warp-wide any boolean aggregator */ -DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) -{ +DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) { #if CUDART_VERSION >= 9000 inFlag = __any_sync(mask, inFlag); #else @@ -601,8 +545,7 @@ DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) } /** warp-wide all boolean aggregator */ -DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) -{ +DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) { #if CUDART_VERSION >= 9000 inFlag = __all_sync(mask, inFlag); #else @@ -621,8 +564,8 @@ DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) * @return the shuffled data */ template -DI T shfl(T val, int srcLane, int width = WarpSize, uint32_t mask = 0xffffffffu) -{ +DI T shfl(T val, int srcLane, int width = WarpSize, + uint32_t mask = 0xffffffffu) { #if CUDART_VERSION >= 9000 return __shfl_sync(mask, val, srcLane, width); #else @@ -640,8 +583,8 @@ DI T shfl(T val, int srcLane, int width = WarpSize, uint32_t mask = 0xffffffffu) * @return the shuffled data */ template -DI T shfl_xor(T val, int laneMask, int width = WarpSize, uint32_t mask = 0xffffffffu) -{ +DI T shfl_xor(T val, int laneMask, int width = WarpSize, + uint32_t mask = 0xffffffffu) { #if CUDART_VERSION >= 9000 return __shfl_xor_sync(mask, val, laneMask, width); #else @@ -659,8 +602,7 @@ DI T shfl_xor(T val, int laneMask, int width = WarpSize, uint32_t mask = 0xfffff * @todo Expand this to support arbitrary reduction ops */ template -DI T warpReduce(T val) -{ +DI T warpReduce(T val) { #pragma unroll for (int i = WarpSize / 2; i > 0; i >>= 1) { T tmp = shfl(val, laneId() + i); @@ -681,13 +623,12 @@ DI T warpReduce(T val) * @todo Expand this to support arbitrary reduction ops */ template -DI T blockReduce(T val, char* smem) -{ - auto* sTemp = reinterpret_cast(smem); - int nWarps = (blockDim.x + WarpSize - 1) / WarpSize; - int lid = laneId(); - int wid = threadIdx.x / WarpSize; - val = warpReduce(val); +DI T blockReduce(T val, char *smem) { + auto *sTemp = reinterpret_cast(smem); + int nWarps = (blockDim.x + WarpSize - 1) / WarpSize; + int lid = laneId(); + int wid = threadIdx.x / WarpSize; + val = warpReduce(val); if (lid == 0) sTemp[wid] = val; __syncthreads(); val = lid < nWarps ? sTemp[lid] : T(0); @@ -703,10 +644,8 @@ DI T blockReduce(T val, char* smem) * @param idx the index for which to query the stream */ inline cudaStream_t select_stream(cudaStream_t user_stream, - cudaStream_t* int_streams, - int n_int_streams, - int idx) -{ + cudaStream_t *int_streams, int n_int_streams, + int idx) { return n_int_streams > 0 ? int_streams[idx % n_int_streams] : user_stream; } diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 872dab7d82..86c60addf2 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -49,20 +49,17 @@ struct cuda_error : public raft::exception { * exception detailing the CUDA error that occurred * */ -#define CUDA_TRY(call) \ - do { \ - cudaError_t const status = call; \ - if (status != cudaSuccess) { \ - cudaGetLastError(); \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "CUDA error encountered at: ", \ - "call='%s', Reason=%s:%s", \ - #call, \ - cudaGetErrorName(status), \ - cudaGetErrorString(status)); \ - throw raft::cuda_error(msg); \ - } \ +#define CUDA_TRY(call) \ + do { \ + cudaError_t const status = call; \ + if (status != cudaSuccess) { \ + cudaGetLastError(); \ + std::string msg{}; \ + SET_ERROR_MSG( \ + msg, "CUDA error encountered at: ", "call='%s', Reason=%s:%s", #call, \ + cudaGetErrorName(status), cudaGetErrorString(status)); \ + throw raft::cuda_error(msg); \ + } \ } while (0) /** @@ -92,16 +89,13 @@ struct cuda_error : public raft::exception { // * @brief check for cuda runtime API errors but log error instead of raising // * exception. // */ -#define CUDA_CHECK_NO_THROW(call) \ - do { \ - cudaError_t const status = call; \ - if (cudaSuccess != status) { \ - printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \ - #call, \ - __FILE__, \ - __LINE__, \ - cudaGetErrorString(status)); \ - } \ +#define CUDA_CHECK_NO_THROW(call) \ + do { \ + cudaError_t const status = call; \ + if (cudaSuccess != status) { \ + printf("CUDA call='%s' at file=%s line=%d failed with %s\n", #call, \ + __FILE__, __LINE__, cudaGetErrorString(status)); \ + } \ } while (0) namespace raft { @@ -109,7 +103,9 @@ namespace raft { /** Helper method to get to know warp size in device code */ __host__ __device__ constexpr inline int warp_size() { return 32; } -__host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } +__host__ __device__ constexpr inline unsigned int warp_full_mask() { + return 0xffffffff; +} /** * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping @@ -128,16 +124,13 @@ class grid_1d_thread_t { * @param elements_per_thread Typically, a single kernel thread processes more than a single * element; this affects the number of threads the grid must contain */ - grid_1d_thread_t(size_t overall_num_elements, - size_t num_threads_per_block, - size_t max_num_blocks_1d, - size_t elements_per_thread = 1) + grid_1d_thread_t(size_t overall_num_elements, size_t num_threads_per_block, + size_t max_num_blocks_1d, size_t elements_per_thread = 1) : block_size(num_threads_per_block), - num_blocks( - std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / - (elements_per_thread * num_threads_per_block), - max_num_blocks_1d)) - { + num_blocks(std::min((overall_num_elements + + (elements_per_thread * num_threads_per_block) - 1) / + (elements_per_thread * num_threads_per_block), + max_num_blocks_1d)) { RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, "num_threads_per_block / warp_size() must be > 0"); @@ -160,14 +153,13 @@ class grid_1d_warp_t { * specific features (amount of shared memory necessary, SM functional units use pattern etc.); * this can't be determined generically/automatically (as opposed to the number of blocks) */ - grid_1d_warp_t(size_t overall_num_elements, - size_t num_threads_per_block, + grid_1d_warp_t(size_t overall_num_elements, size_t num_threads_per_block, size_t max_num_blocks_1d) : block_size(num_threads_per_block), - num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / - (num_threads_per_block / warp_size()), - max_num_blocks_1d)) - { + num_blocks(std::min( + (overall_num_elements + (num_threads_per_block / warp_size()) - 1) / + (num_threads_per_block / warp_size()), + max_num_blocks_1d)) { RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, "num_threads_per_block / warp_size() must be > 0"); @@ -189,12 +181,10 @@ class grid_1d_block_t { * specific features (amount of shared memory necessary, SM functional units use pattern etc.); * this can't be determined generically/automatically (as opposed to the number of blocks) */ - grid_1d_block_t(size_t overall_num_elements, - size_t num_threads_per_block, + grid_1d_block_t(size_t overall_num_elements, size_t num_threads_per_block, size_t max_num_blocks_1d) : block_size(num_threads_per_block), - num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) - { + num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) { RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, "num_threads_per_block / warp_size() must be > 0"); @@ -210,9 +200,9 @@ class grid_1d_block_t { * @param stream cuda stream */ template -void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) -{ - CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); +void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) { + CUDA_CHECK( + cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); } /** @@ -223,22 +213,23 @@ void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) */ /** performs a host to device copy */ template -void update_device(Type* d_ptr, const Type* h_ptr, size_t len, cudaStream_t stream) -{ +void update_device(Type* d_ptr, const Type* h_ptr, size_t len, + cudaStream_t stream) { copy(d_ptr, h_ptr, len, stream); } /** performs a device to host copy */ template -void update_host(Type* h_ptr, const Type* d_ptr, size_t len, cudaStream_t stream) -{ +void update_host(Type* h_ptr, const Type* d_ptr, size_t len, + cudaStream_t stream) { copy(h_ptr, d_ptr, len, stream); } template -void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, cudaStream_t stream) -{ - CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); +void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, + cudaStream_t stream) { + CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), + cudaMemcpyDeviceToDevice, stream)); } /** @} */ @@ -247,11 +238,8 @@ void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, cudaStream_t strea * @{ */ template -void print_host_vector(const char* variable_name, - const T* host_mem, - size_t componentsCount, - OutStream& out) -{ +void print_host_vector(const char* variable_name, const T* host_mem, + size_t componentsCount, OutStream& out) { out << variable_name << "=["; for (size_t i = 0; i < componentsCount; ++i) { if (i != 0) out << ","; @@ -261,13 +249,11 @@ void print_host_vector(const char* variable_name, } template -void print_device_vector(const char* variable_name, - const T* devMem, - size_t componentsCount, - OutStream& out) -{ +void print_device_vector(const char* variable_name, const T* devMem, + size_t componentsCount, OutStream& out) { T* host_mem = new T[componentsCount]; - CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), + cudaMemcpyDeviceToHost)); print_host_vector(variable_name, host_mem, componentsCount, out); delete[] host_mem; } @@ -275,36 +261,35 @@ void print_device_vector(const char* variable_name, /** cuda malloc */ template -void allocate(Type*& ptr, size_t len, bool setZero = false) -{ +void allocate(Type*& ptr, size_t len, bool setZero = false) { CUDA_CHECK(cudaMalloc((void**)&ptr, sizeof(Type) * len)); if (setZero) CUDA_CHECK(cudaMemset(ptr, 0, sizeof(Type) * len)); } /** helper method to get max usable shared mem per block parameter */ -inline int getSharedMemPerBlock() -{ +inline int getSharedMemPerBlock() { int devId; CUDA_CHECK(cudaGetDevice(&devId)); int smemPerBlk; - CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); + CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk, + cudaDevAttrMaxSharedMemoryPerBlock, devId)); return smemPerBlk; } /** helper method to get multi-processor count parameter */ -inline int getMultiProcessorCount() -{ +inline int getMultiProcessorCount() { int devId; CUDA_CHECK(cudaGetDevice(&devId)); int mpCount; - CUDA_CHECK(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); + CUDA_CHECK( + cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); return mpCount; } /** helper method to convert an array on device to a string on host */ template -std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) -{ +std::string arr2Str(const T* arr, int size, std::string name, + cudaStream_t stream, int width = 4) { std::stringstream ss; T* arr_h = (T*)malloc(size * sizeof(T)); @@ -326,54 +311,53 @@ std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t strea /** this seems to be unused, but may be useful in the future */ template -void ASSERT_DEVICE_MEM(T* ptr, std::string name) -{ +void ASSERT_DEVICE_MEM(T* ptr, std::string name) { cudaPointerAttributes s_att; cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); if (s_err != 0 || s_att.device == -1) - std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device - << ", err=" << s_err << std::endl; + std::cout << "Invalid device pointer encountered in " << name + << ". device=" << s_att.device << ", err=" << s_err << std::endl; } -inline uint32_t curTimeMillis() -{ - auto now = std::chrono::high_resolution_clock::now(); +inline uint32_t curTimeMillis() { + auto now = std::chrono::high_resolution_clock::now(); auto duration = now.time_since_epoch(); - return std::chrono::duration_cast(duration).count(); + return std::chrono::duration_cast(duration) + .count(); } /** Helper function to calculate need memory for allocate to store dense matrix. - * @param rows number of rows in matrix - * @param columns number of columns in matrix - * @return need number of items to allocate via allocate() - * @sa allocate() - */ -inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } + * @param rows number of rows in matrix + * @param columns number of columns in matrix + * @return need number of items to allocate via allocate() + * @sa allocate() + */ +inline size_t allocLengthForMatrix(size_t rows, size_t columns) { + return rows * columns; +} /** Helper function to check alignment of pointer. - * @param ptr the pointer to check - * @param alignment to be checked for - * @return true if address in bytes is a multiple of alignment - */ + * @param ptr the pointer to check + * @param alignment to be checked for + * @return true if address in bytes is a multiple of alignment + */ template -bool is_aligned(Type* ptr, size_t alignment) -{ +bool is_aligned(Type* ptr, size_t alignment) { return reinterpret_cast(ptr) % alignment == 0; } /** calculate greatest common divisor of two numbers - * @a integer - * @b integer - * @ return gcd of a and b - */ +* @a integer +* @b integer +* @ return gcd of a and b +*/ template -IntType gcd(IntType a, IntType b) -{ +IntType gcd(IntType a, IntType b) { while (b != 0) { IntType tmp = b; - b = a % b; - a = tmp; + b = a % b; + a = tmp; } return a; } diff --git a/cpp/include/raft/device_atomics.cuh b/cpp/include/raft/device_atomics.cuh index e113ca92eb..dc8093ca1d 100644 --- a/cpp/include/raft/device_atomics.cuh +++ b/cpp/include/raft/device_atomics.cuh @@ -39,9 +39,9 @@ namespace detail { /* @brief binary `sum` operator */ struct DeviceSum { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) - { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) { return lhs + rhs; } }; @@ -49,8 +49,7 @@ struct DeviceSum { /* @brief binary `min` operator */ struct DeviceMin { template - __device__ T operator()(const T& lhs, const T& rhs) - { + __device__ T operator()(const T& lhs, const T& rhs) { return lhs < rhs ? lhs : rhs; } }; @@ -58,44 +57,43 @@ struct DeviceMin { /* @brief binary `max` operator */ struct DeviceMax { template - __device__ T operator()(const T& lhs, const T& rhs) - { + __device__ T operator()(const T& lhs, const T& rhs) { return lhs > rhs ? lhs : rhs; } }; /* @brief binary `product` operator */ struct DeviceProduct { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) - { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) { return lhs * rhs; } }; /* @brief binary `and` operator */ struct DeviceAnd { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) - { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) { return (lhs & rhs); } }; /* @brief binary `or` operator */ struct DeviceOr { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) - { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) { return (lhs | rhs); } }; /* @brief binary `xor` operator */ struct DeviceXor { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) - { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) { return (lhs ^ rhs); } }; @@ -105,9 +103,9 @@ struct DeviceXor { #define errmsg_cast "size mismatch." template -__forceinline__ __device__ T_output type_reinterpret(T_input value) -{ - static_assert(sizeof(T_output) == sizeof(T_input), "type_reinterpret for different size"); +__forceinline__ __device__ T_output type_reinterpret(T_input value) { + static_assert(sizeof(T_output) == sizeof(T_input), + "type_reinterpret for different size"); return *(reinterpret_cast(&value)); } @@ -120,22 +118,25 @@ struct genericAtomicOperationImpl; // single byte atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) - { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, + Op op) { using T_int = unsigned int; - T_int* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); - T_int shift = ((reinterpret_cast(addr) & 3) * 8); + T_int* address_uint32 = + reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); + T_int shift = ((reinterpret_cast(addr) & 3) * 8); T_int old = *address_uint32; T_int assumed; do { - assumed = old; - T target_value = T((old >> shift) & 0xff); - uint8_t updating_value = type_reinterpret(op(target_value, update_value)); - T_int new_value = (old & ~(0x000000ff << shift)) | (T_int(updating_value) << shift); - old = atomicCAS(address_uint32, assumed, new_value); + assumed = old; + T target_value = T((old >> shift) & 0xff); + uint8_t updating_value = + type_reinterpret(op(target_value, update_value)); + T_int new_value = + (old & ~(0x000000ff << shift)) | (T_int(updating_value) << shift); + old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); return T((old >> shift) & 0xff); @@ -145,24 +146,26 @@ struct genericAtomicOperationImpl { // 2 bytes atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) - { - using T_int = unsigned int; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, + Op op) { + using T_int = unsigned int; bool is_32_align = (reinterpret_cast(addr) & 2) ? false : true; - T_int* address_uint32 = - reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 0 : 2)); + T_int* address_uint32 = reinterpret_cast( + reinterpret_cast(addr) - (is_32_align ? 0 : 2)); T_int old = *address_uint32; T_int assumed; do { - assumed = old; - T target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); - uint16_t updating_value = type_reinterpret(op(target_value, update_value)); - - T_int new_value = (is_32_align) ? (old & 0xffff0000) | updating_value - : (old & 0xffff) | (T_int(updating_value) << 16); - old = atomicCAS(address_uint32, assumed, new_value); + assumed = old; + T target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); + uint16_t updating_value = + type_reinterpret(op(target_value, update_value)); + + T_int new_value = (is_32_align) + ? (old & 0xffff0000) | updating_value + : (old & 0xffff) | (T_int(updating_value) << 16); + old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); return (is_32_align) ? T(old & 0xffff) : T(old >> 16); @@ -173,15 +176,15 @@ struct genericAtomicOperationImpl { // 4 bytes atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) - { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, + Op op) { using T_int = unsigned int; T old_value = *addr; T assumed{old_value}; do { - assumed = old_value; + assumed = old_value; const T new_value = op(old_value, update_value); T_int ret = atomicCAS(reinterpret_cast(addr), @@ -198,8 +201,8 @@ struct genericAtomicOperationImpl { // 8 bytes atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) - { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, + Op op) { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); @@ -207,7 +210,7 @@ struct genericAtomicOperationImpl { T assumed{old_value}; do { - assumed = old_value; + assumed = old_value; const T new_value = op(old_value, update_value); T_int ret = atomicCAS(reinterpret_cast(addr), @@ -223,8 +226,8 @@ struct genericAtomicOperationImpl { // ------------------------------------------------------------------------------------------------- // specialized functions for operators -// `atomicAdd` supports int, unsigned int, unsigend long long int, float, double (long long int is -// not supproted.) `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int +// `atomicAdd` supports int, unsigned int, unsigend long long int, float, double (long long int is not supproted.) +// `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int // `atomicAnd`, `atomicOr`, `atomicXor` support int, unsigned int, unsigned long long int // CUDA natively supports `unsigned long long int` for `atomicAdd`, @@ -237,11 +240,12 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) - { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, + DeviceSum op) { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAdd(reinterpret_cast(addr), type_reinterpret(update_value)); + T_int ret = atomicAdd(reinterpret_cast(addr), + type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -249,11 +253,12 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = unsigned long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) - { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, + DeviceSum op) { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAdd(reinterpret_cast(addr), type_reinterpret(update_value)); + T_int ret = atomicAdd(reinterpret_cast(addr), + type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -268,11 +273,12 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = long long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) - { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, + DeviceSum op) { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAdd(reinterpret_cast(addr), type_reinterpret(update_value)); + T_int ret = atomicAdd(reinterpret_cast(addr), + type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -280,11 +286,12 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = unsigned long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMin op) - { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, + DeviceMin op) { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T ret = atomicMin(reinterpret_cast(addr), type_reinterpret(update_value)); + T ret = atomicMin(reinterpret_cast(addr), + type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -292,44 +299,48 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = unsigned long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) - { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, + DeviceMax op) { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T ret = atomicMax(reinterpret_cast(addr), type_reinterpret(update_value)); + T ret = atomicMax(reinterpret_cast(addr), + type_reinterpret(update_value)); return type_reinterpret(ret); } }; template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceAnd op) - { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, + DeviceAnd op) { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAnd(reinterpret_cast(addr), type_reinterpret(update_value)); + T_int ret = atomicAnd(reinterpret_cast(addr), + type_reinterpret(update_value)); return type_reinterpret(ret); } }; template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceOr op) - { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, + DeviceOr op) { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicOr(reinterpret_cast(addr), type_reinterpret(update_value)); + T_int ret = atomicOr(reinterpret_cast(addr), + type_reinterpret(update_value)); return type_reinterpret(ret); } }; template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceXor op) - { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, + DeviceXor op) { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicXor(reinterpret_cast(addr), type_reinterpret(update_value)); + T_int ret = atomicXor(reinterpret_cast(addr), + type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -342,12 +353,13 @@ struct typesAtomicCASImpl; template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) - { + __forceinline__ __device__ T operator()(T* addr, T const& compare, + T const& update_value) { using T_int = unsigned int; - T_int shift = ((reinterpret_cast(addr) & 3) * 8); - T_int* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); + T_int shift = ((reinterpret_cast(addr) & 3) * 8); + T_int* address_uint32 = + reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); // the 'target_value' in `old` can be different from `compare` // because other thread may update the value @@ -358,14 +370,15 @@ struct typesAtomicCASImpl { uint8_t u_val = type_reinterpret(update_value); do { - assumed = old; + assumed = old; target_value = T((old >> shift) & 0xff); // have to compare `target_value` and `compare` before calling atomicCAS // the `target_value` in `old` can be different with `compare` if (target_value != compare) break; - T_int new_value = (old & ~(0x000000ff << shift)) | (T_int(u_val) << shift); - old = atomicCAS(address_uint32, assumed, new_value); + T_int new_value = + (old & ~(0x000000ff << shift)) | (T_int(u_val) << shift); + old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); return target_value; @@ -374,13 +387,13 @@ struct typesAtomicCASImpl { template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) - { + __forceinline__ __device__ T operator()(T* addr, T const& compare, + T const& update_value) { using T_int = unsigned int; bool is_32_align = (reinterpret_cast(addr) & 2) ? false : true; - T_int* address_uint32 = - reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 0 : 2)); + T_int* address_uint32 = reinterpret_cast( + reinterpret_cast(addr) - (is_32_align ? 0 : 2)); T_int old = *address_uint32; T_int assumed; @@ -388,12 +401,12 @@ struct typesAtomicCASImpl { uint16_t u_val = type_reinterpret(update_value); do { - assumed = old; + assumed = old; target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); if (target_value != compare) break; - T_int new_value = - (is_32_align) ? (old & 0xffff0000) | u_val : (old & 0xffff) | (T_int(u_val) << 16); + T_int new_value = (is_32_align) ? (old & 0xffff0000) | u_val + : (old & 0xffff) | (T_int(u_val) << 16); old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); @@ -403,8 +416,8 @@ struct typesAtomicCASImpl { template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) - { + __forceinline__ __device__ T operator()(T* addr, T const& compare, + T const& update_value) { using T_int = unsigned int; T_int ret = atomicCAS(reinterpret_cast(addr), @@ -418,8 +431,8 @@ struct typesAtomicCASImpl { // 8 bytes atomic operation template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) - { + __forceinline__ __device__ T operator()(T* addr, T const& compare, + T const& update_value) { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); @@ -451,10 +464,11 @@ struct typesAtomicCASImpl { * @returns The old value at `address` * -------------------------------------------------------------------------**/ template -typename std::enable_if_t::value, T> __forceinline__ __device__ -genericAtomicOperation(T* address, T const& update_value, BinaryOp op) -{ - auto fun = raft::device_atomics::detail::genericAtomicOperationImpl{}; +typename std::enable_if_t::value, T> __forceinline__ + __device__ + genericAtomicOperation(T* address, T const& update_value, BinaryOp op) { + auto fun = + raft::device_atomics::detail::genericAtomicOperationImpl{}; return T(fun(address, update_value, op)); } @@ -462,11 +476,11 @@ genericAtomicOperation(T* address, T const& update_value, BinaryOp op) template __forceinline__ __device__ bool genericAtomicOperation(bool* address, bool const& update_value, - BinaryOp op) -{ + BinaryOp op) { using T = bool; // don't use underlying type to apply operation for bool - auto fun = raft::device_atomics::detail::genericAtomicOperationImpl{}; + auto fun = + raft::device_atomics::detail::genericAtomicOperationImpl{}; return T(fun(address, update_value, op)); } @@ -488,9 +502,9 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address, * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicAdd(T* address, T val) -{ - return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceSum{}); +__forceinline__ __device__ T atomicAdd(T* address, T val) { + return raft::genericAtomicOperation( + address, val, raft::device_atomics::detail::DeviceSum{}); } /** @@ -509,9 +523,9 @@ __forceinline__ __device__ T atomicAdd(T* address, T val) * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMin(T* address, T val) -{ - return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceMin{}); +__forceinline__ __device__ T atomicMin(T* address, T val) { + return raft::genericAtomicOperation( + address, val, raft::device_atomics::detail::DeviceMin{}); } /** @@ -530,9 +544,9 @@ __forceinline__ __device__ T atomicMin(T* address, T val) * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMax(T* address, T val) -{ - return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceMax{}); +__forceinline__ __device__ T atomicMax(T* address, T val) { + return raft::genericAtomicOperation( + address, val, raft::device_atomics::detail::DeviceMax{}); } /** @@ -552,9 +566,9 @@ __forceinline__ __device__ T atomicMax(T* address, T val) * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicCAS(T* address, T compare, T val) -{ - return raft::device_atomics::detail::typesAtomicCASImpl()(address, compare, val); +__forceinline__ __device__ T atomicCAS(T* address, T compare, T val) { + return raft::device_atomics::detail::typesAtomicCASImpl()(address, compare, + val); } /** @@ -572,10 +586,11 @@ __forceinline__ __device__ T atomicCAS(T* address, T compare, T val) * * @returns The old value at `address` */ -template ::value, T>* = nullptr> -__forceinline__ __device__ T atomicAnd(T* address, T val) -{ - return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceAnd{}); +template ::value, T>* = nullptr> +__forceinline__ __device__ T atomicAnd(T* address, T val) { + return raft::genericAtomicOperation( + address, val, raft::device_atomics::detail::DeviceAnd{}); } /** @@ -593,10 +608,11 @@ __forceinline__ __device__ T atomicAnd(T* address, T val) * * @returns The old value at `address` */ -template ::value, T>* = nullptr> -__forceinline__ __device__ T atomicOr(T* address, T val) -{ - return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceOr{}); +template ::value, T>* = nullptr> +__forceinline__ __device__ T atomicOr(T* address, T val) { + return raft::genericAtomicOperation(address, val, + raft::device_atomics::detail::DeviceOr{}); } /** @@ -614,8 +630,9 @@ __forceinline__ __device__ T atomicOr(T* address, T val) * * @returns The old value at `address` */ -template ::value, T>* = nullptr> -__forceinline__ __device__ T atomicXor(T* address, T val) -{ - return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceXor{}); +template ::value, T>* = nullptr> +__forceinline__ __device__ T atomicXor(T* address, T val) { + return raft::genericAtomicOperation( + address, val, raft::device_atomics::detail::DeviceXor{}); } diff --git a/cpp/include/raft/distance/canberra.cuh b/cpp/include/raft/distance/canberra.cuh index 61622d7c87..b87c295eb0 100644 --- a/cpp/include/raft/distance/canberra.cuh +++ b/cpp/include/raft/distance/canberra.cuh @@ -44,108 +44,75 @@ namespace distance { * @param fin_op the final gemm epilogue lambda * @param stream cuda stream to launch work */ -template -static void canberraImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ +template +static void canberraImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, + IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef typename std::conditional::type KPolicy; + typedef + typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { const auto diff = raft::L1Op()(x - y); - const auto add = raft::myAbs(x) + raft::myAbs(y); + const auto add = raft::myAbs(x) + raft::myAbs(y); // deal with potential for 0 in denominator by // forcing 1/0 instead acc += ((add != 0) * diff / (add + (add == 0))); }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = [] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { return; }; if (isRowMajor) { - auto canberraRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, canberraRowMajor); + auto canberraRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = + launchConfigGenerator(m, n, KPolicy::SmemSize, canberraRowMajor); canberraRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); } else { - auto canberraColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, canberraColMajor); + auto canberraColMajor = + pairwiseDistanceMatKernel; + dim3 grid = + launchConfigGenerator(m, n, KPolicy::SmemSize, canberraColMajor); canberraColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void canberra(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ +template +void canberra(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, OutT *dOutput, FinalLambda fin_op, + cudaStream_t stream) { size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - canberraImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + canberraImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - canberraImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + canberraImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); } else { canberraImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -170,25 +137,16 @@ void canberra(IdxT m, * @param[in] stream cuda stream to launch work * @param[in] isRowMajor whether the input and output matrices are row major */ -template -void canberraImpl(int m, - int n, - int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ +template +void canberraImpl(int m, int n, int k, const InType *pA, const InType *pB, + OutType *pD, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor) { typedef std::is_same is_bool; - typedef typename std::conditional::type canberraOutType; + typedef typename std::conditional::type + canberraOutType; Index_ lda, ldb, ldd; - canberraOutType* pDcast = reinterpret_cast(pD); + canberraOutType *pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; canberra( diff --git a/cpp/include/raft/distance/chebyshev.cuh b/cpp/include/raft/distance/chebyshev.cuh index b7ecdb945b..8d53408cf8 100644 --- a/cpp/include/raft/distance/chebyshev.cuh +++ b/cpp/include/raft/distance/chebyshev.cuh @@ -44,105 +44,72 @@ namespace distance { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void chebyshevImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ +template +static void chebyshevImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, + IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef typename std::conditional::type KPolicy; + typedef + typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { const auto diff = raft::L1Op()(x - y); - acc = raft::myMax(acc, diff); + acc = raft::myMax(acc, diff); }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = [] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { return; }; if (isRowMajor) { - auto chebyshevRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, chebyshevRowMajor); + auto chebyshevRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + chebyshevRowMajor); chebyshevRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); } else { - auto chebyshevColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, chebyshevColMajor); + auto chebyshevColMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + chebyshevColMajor); chebyshevColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void chebyshev(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ +template +void chebyshev(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - chebyshevImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + chebyshevImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - chebyshevImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + chebyshevImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); } else { chebyshevImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -167,25 +134,16 @@ void chebyshev(IdxT m, * @param[in] stream cuda stream to launch work * @param[in] isRowMajor whether the input and output matrices are row major */ -template -void chebyshevImpl(int m, - int n, - int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ +template +void chebyshevImpl(int m, int n, int k, const InType *pA, const InType *pB, + OutType *pD, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor) { typedef std::is_same is_bool; - typedef typename std::conditional::type chebyshevOutType; + typedef typename std::conditional::type + chebyshevOutType; Index_ lda, ldb, ldd; - chebyshevOutType* pDcast = reinterpret_cast(pD); + chebyshevOutType *pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; chebyshev( diff --git a/cpp/include/raft/distance/cosine.cuh b/cpp/include/raft/distance/cosine.cuh index 3e034e15d2..ed9bd28b7f 100644 --- a/cpp/include/raft/distance/cosine.cuh +++ b/cpp/include/raft/distance/cosine.cuh @@ -24,7 +24,7 @@ namespace distance { /** * @brief the cosine distance matrix calculation implementer - * It computes the following equation: + * It computes the following equation: * C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2))) * @tparam DataT input data-type (for A and B matrices) * @tparam AccT accumulation data-type @@ -49,43 +49,30 @@ namespace distance { * @param fin_op the final gemm epilogue lambda * @param stream cuda stream to launch cuda operations. */ -template -void cosineImpl(const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ +template +void cosineImpl(const DataT *x, const DataT *y, const DataT *xn, + const DataT *yn, IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, + IdxT ldd, OutT *dOutput, FinalLambda fin_op, + cudaStream_t stream) { typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef typename std::conditional::type KPolicy; + typedef + typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { + acc += x * y; + }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll @@ -98,66 +85,43 @@ void cosineImpl(const DataT* x, constexpr size_t shmemSize = KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); if (isRowMajor) { - auto cosineRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineRowMajor); + auto cosineRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineRowMajor); cosineRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, + fin_op); } else { - auto cosineColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineColMajor); + auto cosineColMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineColMajor); cosineColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, + fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void cosine(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ +template +void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, const DataT *xn, const DataT *yn, + OutT *dOutput, FinalLambda fin_op, cudaStream_t stream) { size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - cosineImpl( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + cosineImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, + fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - cosineImpl( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + cosineImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, + fin_op, stream); } else { cosineImpl( x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -166,7 +130,7 @@ void cosine(IdxT m, /** * @brief the expanded cosine distance matrix calculation - * It computes the following equation: + * It computes the following equation: * C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2))) * @tparam IType input data-type (for A and B matrices) * @tparam AccType accumulation data-type @@ -187,23 +151,12 @@ void cosine(IdxT m, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void cosineAlgo1(Index_ m, - Index_ n, - Index_ k, - const InType* pA, - const InType* pB, - OutType* pD, - AccType* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ +template +void cosineAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA, + const InType *pB, OutType *pD, AccType *workspace, + size_t worksize, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor) { auto norm_op = [] __device__(AccType in) { return raft::mySqrt(in); }; // Wrap fin_op to allow computing 1 - pA before calling fin_op @@ -212,33 +165,39 @@ void cosineAlgo1(Index_ m, }; typedef std::is_same is_bool; - typedef typename std::conditional::type CosOutType; - CosOutType* pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type + CosOutType; + CosOutType *pDcast = reinterpret_cast(pD); - ASSERT( - !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))), - "workspace size error"); + ASSERT(!(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || + (worksize < m * sizeof(AccType))), + "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); Index_ lda, ldb, ldd; - InType* col_vec = workspace; - InType* row_vec = workspace; + InType *col_vec = workspace; + InType *row_vec = workspace; if (pA != pB) { row_vec += m; - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); - raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, + stream, norm_op); + raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, + stream, norm_op); } else { - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, + stream, norm_op); } if (isRowMajor) { lda = k, ldb = k, ldd = n; cosine( - m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, wrapped_fin_op, stream); + m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, wrapped_fin_op, + stream); } else { lda = n, ldb = m, ldd = m; - cosine( - n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast, wrapped_fin_op, stream); + cosine(n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast, + wrapped_fin_op, stream); } } diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index 1627753b43..1b39a6ec18 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -32,314 +32,140 @@ namespace raft { namespace distance { namespace { -template struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType metric_arg = 2.0f) - { - } + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType metric_arg = 2.0f) {} }; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType metric_arg) - { - raft::distance::euclideanAlgo1( - m, n, k, x, y, dist, false, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType metric_arg) { + raft::distance::euclideanAlgo1(m, n, k, x, y, dist, false, + (AccType *)workspace, worksize, + fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType metric_arg) - { - raft::distance::euclideanAlgo1( - m, n, k, x, y, dist, true, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType metric_arg) { + raft::distance::euclideanAlgo1(m, n, k, x, y, dist, true, + (AccType *)workspace, worksize, + fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType metric_arg) - { +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType metric_arg) { raft::distance::cosineAlgo1( - m, n, k, x, y, dist, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); + m, n, k, x, y, dist, (AccType *)workspace, worksize, fin_op, stream, + isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType metric_arg) - { - raft::distance::euclideanAlgo2( - m, n, k, x, y, dist, false, fin_op, stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType metric_arg) { + raft::distance::euclideanAlgo2(m, n, k, x, y, dist, false, fin_op, + stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType metric_arg) - { - raft::distance::euclideanAlgo2( - m, n, k, x, y, dist, true, fin_op, stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType metric_arg) { + raft::distance::euclideanAlgo2(m, n, k, x, y, dist, true, fin_op, + stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType metric_arg) - { +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType metric_arg) { raft::distance::l1Impl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType metric_arg) - { - raft::distance::chebyshevImpl( - m, n, k, x, y, dist, fin_op, stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType metric_arg) { + raft::distance::chebyshevImpl(m, n, k, x, y, dist, fin_op, stream, + isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType metric_arg) - { - raft::distance::hellingerImpl( - m, n, k, x, y, dist, fin_op, stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType metric_arg) { + raft::distance::hellingerImpl(m, n, k, x, y, dist, fin_op, stream, + isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType metric_arg) - { - raft::distance::minkowskiImpl( - m, n, k, x, y, dist, fin_op, stream, isRowMajor, metric_arg); +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType metric_arg) { + raft::distance::minkowskiImpl(m, n, k, x, y, dist, fin_op, stream, + isRowMajor, metric_arg); } }; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType metric_arg) - { +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType metric_arg) { raft::distance::canberraImpl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } @@ -363,15 +189,13 @@ struct DistanceImpl -size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k) -{ - size_t worksize = 0; - constexpr bool is_allocated = distanceType <= raft::distance::DistanceType::CosineExpanded; +template +size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n, + Index_ k) { + size_t worksize = 0; + constexpr bool is_allocated = + distanceType <= raft::distance::DistanceType::CosineExpanded; if (is_allocated) { worksize += m * sizeof(AccType); if (x != y) worksize += n * sizeof(AccType); @@ -404,27 +228,17 @@ size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, In * as follows:

OutType fin_op(AccType in, int g_idx);
. If one needs * any other parameters, feel free to pass them via closure. */ -template -void distance(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor = true, - InType metric_arg = 2.0f) -{ - DistanceImpl distImpl; - distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg); +void distance(const InType *x, const InType *y, OutType *dist, Index_ m, + Index_ n, Index_ k, void *workspace, size_t worksize, + FinalLambda fin_op, cudaStream_t stream, bool isRowMajor = true, + InType metric_arg = 2.0f) { + DistanceImpl + distImpl; + distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream, + isRowMajor, metric_arg); CUDA_CHECK(cudaPeekAtLastError()); } @@ -449,26 +263,18 @@ void distance(const InType* x, * @note if workspace is passed as nullptr, this will return in * worksize, the number of bytes of workspace required */ -template -void distance(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - cudaStream_t stream, - bool isRowMajor = true, - InType metric_arg = 2.0f) -{ - auto default_fin_op = [] __device__(AccType d_val, Index_ g_d_idx) { return d_val; }; - distance( - x, y, dist, m, n, k, workspace, worksize, default_fin_op, stream, isRowMajor, metric_arg); +template +void distance(const InType *x, const InType *y, OutType *dist, Index_ m, + Index_ n, Index_ k, void *workspace, size_t worksize, + cudaStream_t stream, bool isRowMajor = true, + InType metric_arg = 2.0f) { + auto default_fin_op = [] __device__(AccType d_val, Index_ g_d_idx) { + return d_val; + }; + distance(x, y, dist, m, n, k, workspace, worksize, default_fin_op, + stream, isRowMajor, metric_arg); CUDA_CHECK(cudaPeekAtLastError()); } @@ -492,47 +298,39 @@ void distance(const InType* x, * @param isRowMajor whether the matrices are row-major or col-major */ template -void pairwise_distance_impl(const Type* x, - const Type* y, - Type* dist, - Index_ m, - Index_ n, - Index_ k, - raft::mr::device::buffer& workspace, - cudaStream_t stream, - bool isRowMajor, - Type metric_arg = 2.0f) -{ - auto worksize = getWorkspaceSize(x, y, m, n, k); +void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m, + Index_ n, Index_ k, + raft::mr::device::buffer &workspace, + cudaStream_t stream, bool isRowMajor, + Type metric_arg = 2.0f) { + auto worksize = + getWorkspaceSize(x, y, m, n, k); workspace.resize(worksize, stream); - distance( - x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor, metric_arg); + distance(x, y, dist, m, n, k, + workspace.data(), worksize, + stream, isRowMajor, metric_arg); } template -void pairwise_distance(const Type* x, - const Type* y, - Type* dist, - Index_ m, - Index_ n, - Index_ k, - raft::mr::device::buffer& workspace, - raft::distance::DistanceType metric, - cudaStream_t stream, - bool isRowMajor = true, - Type metric_arg = 2.0f) -{ +void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m, + Index_ n, Index_ k, + raft::mr::device::buffer &workspace, + raft::distance::DistanceType metric, cudaStream_t stream, + bool isRowMajor = true, Type metric_arg = 2.0f) { switch (metric) { case raft::distance::DistanceType::L2Expanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::L2SqrtExpanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::CosineExpanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::L1: @@ -540,11 +338,13 @@ void pairwise_distance(const Type* x, x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::L2Unexpanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::L2SqrtUnexpanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::Linf: @@ -552,18 +352,22 @@ void pairwise_distance(const Type* x, x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::HellingerExpanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; case raft::distance::DistanceType::LpUnexpanded: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor, metric_arg); break; case raft::distance::DistanceType::Canberra: - pairwise_distance_impl( + pairwise_distance_impl( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; - default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); + default: + THROW("Unknown or unsupported distance metric '%d'!", (int)metric); }; } /** @} */ diff --git a/cpp/include/raft/distance/euclidean.cuh b/cpp/include/raft/distance/euclidean.cuh index 46d0a1a4a9..484da0e5bf 100644 --- a/cpp/include/raft/distance/euclidean.cuh +++ b/cpp/include/raft/distance/euclidean.cuh @@ -48,44 +48,30 @@ namespace distance { * @param fin_op the final gemm epilogue lambda * @param stream cuda stream to launch cuda operations. */ -template -void euclideanExpImpl(const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - bool sqrt, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ +template +void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn, + const DataT *yn, IdxT m, IdxT n, IdxT k, IdxT lda, + IdxT ldb, IdxT ldd, bool sqrt, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef typename std::conditional::type KPolicy; + typedef + typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { + acc += x * y; + }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [sqrt] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll @@ -107,68 +93,47 @@ void euclideanExpImpl(const DataT* x, constexpr size_t shmemSize = KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); if (isRowMajor) { - auto euclideanExpRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, shmemSize, euclideanExpRowMajor); + auto euclideanExpRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = + launchConfigGenerator(m, n, shmemSize, euclideanExpRowMajor); euclideanExpRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, + fin_op); } else { - auto euclideanExpColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, shmemSize, euclideanExpColMajor); + auto euclideanExpColMajor = + pairwiseDistanceMatKernel; + dim3 grid = + launchConfigGenerator(m, n, shmemSize, euclideanExpColMajor); euclideanExpColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, + fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void euclideanExp(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - bool sqrt, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ +template +void euclideanExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, const DataT *xn, + const DataT *yn, bool sqrt, OutT *dOutput, FinalLambda fin_op, + cudaStream_t stream) { size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - euclideanExpImpl( - x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); + euclideanExpImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, + dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - euclideanExpImpl( - x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); + euclideanExpImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, + dOutput, fin_op, stream); } else { euclideanExpImpl( x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); @@ -196,59 +161,53 @@ void euclideanExp(IdxT m, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void euclideanAlgo1(Index_ m, - Index_ n, - Index_ k, - const InType* pA, - const InType* pB, - OutType* pD, - bool enable_sqrt, - AccType* workspace, - size_t& worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ +template +void euclideanAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA, + const InType *pB, OutType *pD, bool enable_sqrt, + AccType *workspace, size_t &worksize, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor) { auto norm_op = [] __device__(InType in) { return in; }; typedef std::is_same is_bool; - typedef typename std::conditional::type ExpOutType; - ExpOutType* pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type + ExpOutType; + ExpOutType *pDcast = reinterpret_cast(pD); - ASSERT( - !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))), - "workspace size error"); + ASSERT(!(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || + (worksize < m * sizeof(AccType))), + "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); Index_ lda, ldb, ldd; - InType* col_vec = workspace; - InType* row_vec = workspace; + InType *col_vec = workspace; + InType *row_vec = workspace; if (pA != pB) { row_vec += m; - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); - raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, + stream, norm_op); + raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, + stream, norm_op); } else { - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, + stream, norm_op); } if (isRowMajor) { lda = k, ldb = k, ldd = n; euclideanExp( - m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast, fin_op, stream); + m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast, + fin_op, stream); } else { lda = n, ldb = m, ldd = m; euclideanExp( - n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast, fin_op, stream); + n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast, + fin_op, stream); } } /** - * @brief the unexpanded euclidean distance matrix calculation + * @brief the unexpanded euclidean distance matrix calculation * It computes the following equation: cij = op((ai-bj)^2) * @tparam DataT input data-type (for A and B matrices) * @tparam AccT accumulation data-type @@ -268,30 +227,16 @@ void euclideanAlgo1(Index_ m, * @param[output] pD output matrix * @param fin_op the final gemm epilogue lambda */ -template -void euclideanUnExpImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - bool sqrt, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ +template +void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, + IdxT lda, IdxT ldb, IdxT ldd, bool sqrt, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef typename std::conditional::type KPolicy; + typedef + typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); @@ -302,11 +247,10 @@ void euclideanUnExpImpl(const DataT* x, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [sqrt] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { if (sqrt) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { @@ -319,68 +263,48 @@ void euclideanUnExpImpl(const DataT* x, }; if (isRowMajor) { - auto euclideanUnExpRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, euclideanUnExpRowMajor); + auto euclideanUnExpRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + euclideanUnExpRowMajor); euclideanUnExpRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); } else { - auto euclideanUnExpColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, euclideanUnExpColMajor); + auto euclideanUnExpColMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + euclideanUnExpColMajor); euclideanUnExpColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void euclideanUnExp(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - bool sqrt, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ +template +void euclideanUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, bool sqrt, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - euclideanUnExpImpl( - x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); + euclideanUnExpImpl(x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, + fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - euclideanUnExpImpl( - x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); + euclideanUnExpImpl(x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, + fin_op, stream); } else { euclideanUnExpImpl( x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); @@ -406,25 +330,15 @@ void euclideanUnExp(IdxT m, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void euclideanAlgo2(Index_ m, - Index_ n, - Index_ k, - const InType* pA, - const InType* pB, - OutType* pD, - bool enable_sqrt, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ +template +void euclideanAlgo2(Index_ m, Index_ n, Index_ k, const InType *pA, + const InType *pB, OutType *pD, bool enable_sqrt, + FinalLambda fin_op, cudaStream_t stream, bool isRowMajor) { typedef std::is_same is_bool; - typedef typename std::conditional::type UnExpOutType; - UnExpOutType* pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type + UnExpOutType; + UnExpOutType *pDcast = reinterpret_cast(pD); Index_ lda, ldb, ldd; if (isRowMajor) { diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh index f80b4eb8f7..b96a536e38 100644 --- a/cpp/include/raft/distance/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/fused_l2_nn.cuh @@ -35,24 +35,24 @@ template struct KVPMinReduce { typedef cub::KeyValuePair KVP; - DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } + DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { + return b.value < a.value ? b : a; + } }; // KVPMinReduce template struct MinAndDistanceReduceOp { typedef typename cub::KeyValuePair KVP; - DI void operator()(LabelT rid, KVP* out, const KVP& other) - { + DI void operator()(LabelT rid, KVP* out, const KVP& other) { if (other.value < out->value) { - out->key = other.key; + out->key = other.key; out->value = other.value; } } - DI void init(KVP* out, DataT maxVal) - { - out->key = -1; + DI void init(KVP* out, DataT maxVal) { + out->key = -1; out->value = maxVal; } }; @@ -60,28 +60,30 @@ struct MinAndDistanceReduceOp { template struct MinReduceOp { typedef typename cub::KeyValuePair KVP; - DI void operator()(LabelT rid, DataT* out, const KVP& other) - { - if (other.value < *out) { *out = other.value; } + DI void operator()(LabelT rid, DataT* out, const KVP& other) { + if (other.value < *out) { + *out = other.value; + } } DI void init(DataT* out, DataT maxVal) { *out = maxVal; } }; template -__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) -{ +__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) { auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < m) { redOp.init(min + tid, maxVal); } + if (tid < m) { + redOp.init(min + tid, maxVal); + } } // TODO: specialize this function for MinAndDistanceReduceOp // with atomicCAS of 64 bit which will eliminate mutex and shfls -template -DI void updateReducedVal( - int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, IdxT m, IdxT gridStrideY) -{ - const auto lid = threadIdx.x % raft::WarpSize; +template +DI void updateReducedVal(int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, + IdxT m, IdxT gridStrideY) { + const auto lid = threadIdx.x % raft::WarpSize; const auto accrowid = threadIdx.x / P::AccThCols; // for now have first lane from each warp update a unique output row. This @@ -106,38 +108,21 @@ DI void updateReducedVal( if (j < (raft::WarpSize / P::AccThCols) - 1) { #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { - auto tmpkey = raft::shfl(val[i].key, (j + 1) * P::AccThCols); + auto tmpkey = raft::shfl(val[i].key, (j + 1) * P::AccThCols); auto tmpvalue = raft::shfl(val[i].value, (j + 1) * P::AccThCols); - val[i] = {tmpkey, tmpvalue}; + val[i] = {tmpkey, tmpvalue}; } } } } -template -__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - DataT maxVal, - int* mutex, - ReduceOpT redOp, - KVPReduceOpT pairRedOp, - CoreLambda core_op, - FinalLambda fin_op) -{ +__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( + OutT* min, const DataT* x, const DataT* y, const DataT* xn, const DataT* yn, + IdxT m, IdxT n, IdxT k, DataT maxVal, int* mutex, ReduceOpT redOp, + KVPReduceOpT pairRedOp, CoreLambda core_op, FinalLambda fin_op) { extern __shared__ char smem[]; typedef cub::KeyValuePair KVPair; @@ -150,9 +135,7 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, // epilogue operation lambda for final value calculation auto epilog_lambda = [n, pairRedOp, &val, maxVal] __device__( DataT acc[P::AccRowsPerTh][P::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, + DataT * regxn, DataT * regyn, IdxT gridStrideX, IdxT gridStrideY) { KVPReduceOpT pairRed_op(pairRedOp); @@ -181,105 +164,72 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { auto tmpkey = acccolid + j * P::AccThCols + gridStrideX; - KVPair tmp = {tmpkey, acc[i][j]}; + KVPair tmp = {tmpkey, acc[i][j]}; if (tmpkey < n) { - val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); + val[i] = + pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); } } } }; - auto rowEpilog_lambda = - [m, mutex, min, pairRedOp, redOp, &val, maxVal] __device__(IdxT gridStrideY) { - KVPReduceOpT pairRed_op(pairRedOp); - ReduceOpT red_op(redOp); + auto rowEpilog_lambda = [m, mutex, min, pairRedOp, redOp, &val, + maxVal] __device__(IdxT gridStrideY) { + KVPReduceOpT pairRed_op(pairRedOp); + ReduceOpT red_op(redOp); - const auto accrowid = threadIdx.x / P::AccThCols; - const auto lid = raft::laneId(); + const auto accrowid = threadIdx.x / P::AccThCols; + const auto lid = raft::laneId(); // reduce #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { + for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll - for (int j = P::AccThCols / 2; j > 0; j >>= 1) { - auto tmpkey = raft::shfl(val[i].key, lid + j); - auto tmpvalue = raft::shfl(val[i].value, lid + j); - KVPair tmp = {tmpkey, tmpvalue}; - val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); - } + for (int j = P::AccThCols / 2; j > 0; j >>= 1) { + auto tmpkey = raft::shfl(val[i].key, lid + j); + auto tmpvalue = raft::shfl(val[i].value, lid + j); + KVPair tmp = {tmpkey, tmpvalue}; + val[i] = + pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); } + } - updateReducedVal(mutex, min, val, red_op, m, gridStrideY); + updateReducedVal(mutex, min, val, red_op, + m, gridStrideY); // reset the val array. #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - val[i] = {-1, maxVal}; - } - }; + for (int i = 0; i < P::AccRowsPerTh; ++i) { + val[i] = {-1, maxVal}; + } + }; IdxT lda = k, ldb = k, ldd = n; - PairwiseDistances - obj(x, - y, - m, - n, - k, - lda, - ldb, - ldd, - xn, - yn, - nullptr, - smem, - core_op, - epilog_lambda, - fin_op, - rowEpilog_lambda); + PairwiseDistances + obj(x, y, m, n, k, lda, ldb, ldd, xn, yn, nullptr, smem, core_op, + epilog_lambda, fin_op, rowEpilog_lambda); obj.run(); } -template -void fusedL2NNImpl(OutT* min, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - int* workspace, - ReduceOpT redOp, - KVPReduceOpT pairRedOp, - bool sqrt, - bool initOutBuffer, - cudaStream_t stream) -{ +template +void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, + const DataT* yn, IdxT m, IdxT n, IdxT k, int* workspace, + ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt, + bool initOutBuffer, cudaStream_t stream) { typedef typename linalg::Policy4x4::Policy P; dim3 blk(P::Nthreads); - auto nblks = raft::ceildiv(m, P::Nthreads); + auto nblks = raft::ceildiv(m, P::Nthreads); constexpr auto maxVal = std::numeric_limits::max(); typedef cub::KeyValuePair KVPair; // Accumulation operation lambda - auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) { acc += x * y; }; + auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) { + acc += x * y; + }; CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream)); if (initOutBuffer) { @@ -290,34 +240,25 @@ void fusedL2NNImpl(OutT* min, auto fin_op = [] __device__(DataT d_val, int g_d_idx) { return d_val; }; - constexpr size_t shmemSize = P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT)); + constexpr size_t shmemSize = + P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT)); if (sqrt) { - auto fusedL2NNSqrt = fusedL2NNkernel; - dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NNSqrt); + auto fusedL2NNSqrt = + fusedL2NNkernel; + dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NNSqrt); fusedL2NNSqrt<<>>( - min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op); + min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, + core_lambda, fin_op); } else { - auto fusedL2NN = fusedL2NNkernel; - dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NN); - fusedL2NN<<>>( - min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op); + auto fusedL2NN = + fusedL2NNkernel; + dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NN); + fusedL2NN<<>>(min, x, y, xn, yn, m, n, k, + maxVal, workspace, redOp, + pairRedOp, core_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); @@ -358,32 +299,25 @@ void fusedL2NNImpl(OutT* min, * main kernel launch * @param[in] stream cuda stream */ -template -void fusedL2NN(OutT* min, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - void* workspace, - ReduceOpT redOp, - KVPReduceOpT pairRedOp, - bool sqrt, - bool initOutBuffer, - cudaStream_t stream) -{ +template +void fusedL2NN(OutT* min, const DataT* x, const DataT* y, const DataT* xn, + const DataT* yn, IdxT m, IdxT n, IdxT k, void* workspace, + ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt, + bool initOutBuffer, cudaStream_t stream) { size_t bytes = sizeof(DataT) * k; if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) { fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, + initOutBuffer, stream); } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) { fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, + initOutBuffer, stream); } else { fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, + initOutBuffer, stream); } } diff --git a/cpp/include/raft/distance/hellinger.cuh b/cpp/include/raft/distance/hellinger.cuh index c8c7dad7d4..f7ad3ed1ba 100644 --- a/cpp/include/raft/distance/hellinger.cuh +++ b/cpp/include/raft/distance/hellinger.cuh @@ -23,7 +23,7 @@ namespace distance { /** * @brief the Hellinger distance matrix using the expanded form: - * It computes the following equation: + * It computes the following equation: cij = sqrt(1 - sum(sqrt(x_k * y_k))) * This distance computation modifies A and B by computing a sqrt * and then performing a `pow(x, 2)` to convert it back. Because of this, @@ -51,40 +51,29 @@ namespace distance { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void hellingerImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ +template +static void hellingerImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, + IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef typename std::conditional::type KPolicy; + typedef + typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); - auto unaryOp_lambda = [] __device__(DataT input) { return raft::mySqrt(input); }; + auto unaryOp_lambda = [] __device__(DataT input) { + return raft::mySqrt(input); + }; // First sqrt x and y raft::linalg::unaryOp( - (DataT*)x, x, m * k, unaryOp_lambda, stream); + (DataT *)x, x, m * k, unaryOp_lambda, stream); if (x != y) { raft::linalg::unaryOp( - (DataT*)y, y, n * k, unaryOp_lambda, stream); + (DataT *)y, y, n * k, unaryOp_lambda, stream); } // Accumulation operation lambda @@ -95,91 +84,71 @@ static void hellingerImpl(const DataT* x, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative - const auto finalVal = (1 - acc[i][j]); + const auto finalVal = (1 - acc[i][j]); const auto rectifier = (!signbit(finalVal)); - acc[i][j] = raft::mySqrt(rectifier * finalVal); + acc[i][j] = raft::mySqrt(rectifier * finalVal); } } }; if (isRowMajor) { - auto hellingerRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hellingerRowMajor); + auto hellingerRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + hellingerRowMajor); hellingerRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); } else { - auto hellingerColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hellingerColMajor); + auto hellingerColMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + hellingerColMajor); hellingerColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); } // Revert sqrt of x and y raft::linalg::unaryOp( - (DataT*)x, x, m * k, unaryOp_lambda, stream); + (DataT *)x, x, m * k, unaryOp_lambda, stream); if (x != y) { raft::linalg::unaryOp( - (DataT*)y, y, n * k, unaryOp_lambda, stream); + (DataT *)y, y, n * k, unaryOp_lambda, stream); } CUDA_CHECK(cudaGetLastError()); } -template -void hellinger(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ +template +void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - hellingerImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + hellingerImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - hellingerImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + hellingerImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); } else { hellingerImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -188,7 +157,7 @@ void hellinger(IdxT m, /** * @brief the Hellinger distance matrix calculation - * It computes the following equation: + * It computes the following equation: sqrt(1 - sum(sqrt(x_k * y_k)) * This distance computation modifies A and B by computing a sqrt * and then performing a `pow(x, 2)` to convert it back. Because of this, @@ -210,25 +179,16 @@ void hellinger(IdxT m, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void hellingerImpl(int m, - int n, - int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ +template +void hellingerImpl(int m, int n, int k, const InType *pA, const InType *pB, + OutType *pD, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor) { typedef std::is_same is_bool; - typedef typename std::conditional::type hellingerOutType; + typedef typename std::conditional::type + hellingerOutType; Index_ lda, ldb, ldd; - hellingerOutType* pDcast = reinterpret_cast(pD); + hellingerOutType *pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; hellinger( diff --git a/cpp/include/raft/distance/l1.cuh b/cpp/include/raft/distance/l1.cuh index 268e269391..6ab084f041 100644 --- a/cpp/include/raft/distance/l1.cuh +++ b/cpp/include/raft/distance/l1.cuh @@ -42,29 +42,16 @@ namespace distance { * @param[output] pD output matrix * @param fin_op the final gemm epilogue lambda */ -template -static void l1Impl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ +template +static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, + IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef typename std::conditional::type KPolicy; + typedef + typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); @@ -75,69 +62,47 @@ static void l1Impl(const DataT* x, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = [] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { return; }; if (isRowMajor) { - auto l1RowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, l1RowMajor); + auto l1RowMajor = + pairwiseDistanceMatKernel; + dim3 grid = + launchConfigGenerator(m, n, KPolicy::SmemSize, l1RowMajor); l1RowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); } else { - auto l1ColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, l1ColMajor); + auto l1ColMajor = + pairwiseDistanceMatKernel; + dim3 grid = + launchConfigGenerator(m, n, KPolicy::SmemSize, l1ColMajor); l1ColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void l1(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ +template +void l1(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, const DataT *x, + const DataT *y, OutT *dOutput, FinalLambda fin_op, + cudaStream_t stream) { size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - l1Impl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + l1Impl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { l1Impl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -165,25 +130,16 @@ void l1(IdxT m, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void l1Impl(int m, - int n, - int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ +template +void l1Impl(int m, int n, int k, const InType *pA, const InType *pB, + OutType *pD, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor) { typedef std::is_same is_bool; - typedef typename std::conditional::type L1OutType; + typedef + typename std::conditional::type L1OutType; Index_ lda, ldb, ldd; - L1OutType* pDcast = reinterpret_cast(pD); + L1OutType *pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; l1( diff --git a/cpp/include/raft/distance/minkowski.cuh b/cpp/include/raft/distance/minkowski.cuh index c021954f32..803f5fc78a 100644 --- a/cpp/include/raft/distance/minkowski.cuh +++ b/cpp/include/raft/distance/minkowski.cuh @@ -21,7 +21,7 @@ namespace raft { namespace distance { /** - * @brief the unexpanded Minkowski distance matrix calculation + * @brief the unexpanded Minkowski distance matrix calculation * It computes the following equation: cij = sum(|x - y|^p)^(1/p) * @tparam DataT input data-type (for A and B matrices) * @tparam AccT accumulation data-type @@ -44,30 +44,16 @@ namespace distance { * @param[in] stream cuda stream to launch work * @param[in] the value of `p` for Minkowski (l-p) distances. */ -template -void minkowskiUnExpImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream, - DataT p) -{ +template +void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, + IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream, DataT p) { typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef typename std::conditional::type KPolicy; + typedef + typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); @@ -78,11 +64,10 @@ void minkowskiUnExpImpl(const DataT* x, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [p] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [p] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { const auto one_over_p = 1.0f / p; #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { @@ -94,68 +79,48 @@ void minkowskiUnExpImpl(const DataT* x, }; if (isRowMajor) { - auto minkowskiUnExpRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, minkowskiUnExpRowMajor); + auto minkowskiUnExpRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + minkowskiUnExpRowMajor); minkowskiUnExpRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); } else { - auto minkowskiUnExpColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, minkowskiUnExpColMajor); + auto minkowskiUnExpColMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + minkowskiUnExpColMajor); minkowskiUnExpColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void minkowskiUnExp(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream, - DataT metric_arg) -{ +template +void minkowskiUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream, DataT metric_arg) { size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - minkowskiUnExpImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); + minkowskiUnExpImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, + fin_op, stream, metric_arg); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - minkowskiUnExpImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); + minkowskiUnExpImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, + fin_op, stream, metric_arg); } else { minkowskiUnExpImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); @@ -181,25 +146,15 @@ void minkowskiUnExp(IdxT m, * @param[in] isRowMajor whether the input and output matrices are row major * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances. */ -template -void minkowskiImpl(Index_ m, - Index_ n, - Index_ k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType metric_arg) -{ +template +void minkowskiImpl(Index_ m, Index_ n, Index_ k, const InType *pA, + const InType *pB, OutType *pD, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType metric_arg) { typedef std::is_same is_bool; - typedef typename std::conditional::type LpUnexpOutType; - LpUnexpOutType* pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type + LpUnexpOutType; + LpUnexpOutType *pDcast = reinterpret_cast(pD); Index_ lda, ldb, ldd; if (isRowMajor) { diff --git a/cpp/include/raft/distance/pairwise_distance_base.cuh b/cpp/include/raft/distance/pairwise_distance_base.cuh index 3db4dc0131..43abc9eb65 100644 --- a/cpp/include/raft/distance/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/pairwise_distance_base.cuh @@ -31,11 +31,11 @@ namespace distance { * @tparam OutT output data-type (for C and D matrices) * @tparam IdxT index data-type * @tparam Policy struct which tunes the Contraction kernel - * @tparam CoreLambda tells how to accumulate an x and y into + * @tparam CoreLambda tells how to accumulate an x and y into acc. its signature: template void core_lambda(AccT& acc, const DataT& x, const DataT& y) - * @tparam EpilogueLambda applies an elementwise function to compute final + * @tparam EpilogueLambda applies an elementwise function to compute final values. Its signature is: template void epilogue_lambda (AccT acc[][], DataT* regxn, DataT* regyn); @@ -57,19 +57,13 @@ namespace distance { * @param fin_op the final gemm epilogue lambda */ -template > +template > struct PairwiseDistances : public BaseClass { private: typedef Policy P; @@ -87,21 +81,11 @@ struct PairwiseDistances : public BaseClass { public: // Constructor - DI PairwiseDistances(const DataT* _x, - const DataT* _y, - IdxT _m, - IdxT _n, - IdxT _k, - IdxT _lda, - IdxT _ldb, - IdxT _ldd, - const DataT* _xn, - const DataT* _yn, - OutT* _dOutput, - char* _smem, - CoreLambda _core_op, - EpilogueLambda _epilog_op, - FinalLambda _fin_op, + DI PairwiseDistances(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, + IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, + const DataT* _xn, const DataT* _yn, OutT* _dOutput, + char* _smem, CoreLambda _core_op, + EpilogueLambda _epilog_op, FinalLambda _fin_op, rowEpilogueLambda _rowEpilog_op) : BaseClass(_x, _y, _m, _n, _k, _lda, _ldb, _ldd, _smem), xn(_xn), @@ -112,12 +96,9 @@ struct PairwiseDistances : public BaseClass { core_op(_core_op), epilog_op(_epilog_op), fin_op(_fin_op), - rowEpilog_op(_rowEpilog_op) - { - } + rowEpilog_op(_rowEpilog_op) {} - DI void run() - { + DI void run() { for (auto gridStrideY = blockIdx.y * P::Mblk; gridStrideY < this->m; gridStrideY += P::Mblk * gridDim.y) { for (auto gridStrideX = blockIdx.x * P::Nblk; gridStrideX < this->n; @@ -131,8 +112,7 @@ struct PairwiseDistances : public BaseClass { } private: - DI void updateIndicesY() - { + DI void updateIndicesY() { const auto stride = P::Nblk * gridDim.x; if (isRowMajor) { this->y += stride * this->ldb; @@ -142,23 +122,21 @@ struct PairwiseDistances : public BaseClass { this->yrowid += stride; } - DI void updateIndicesXY() - { + DI void updateIndicesXY() { const auto stride = P::Mblk * gridDim.y; if (isRowMajor) { this->x += stride * this->lda; this->yrowid = IdxT(blockIdx.x) * P::Nblk + this->srowid; - this->y = yBase + this->yrowid * this->ldb; + this->y = yBase + this->yrowid * this->ldb; } else { this->x += stride; this->yrowid = IdxT(blockIdx.x) * P::Nblk; - this->y = yBase + this->yrowid + this->srowid * this->ldb; + this->y = yBase + this->yrowid + this->srowid * this->ldb; } this->xrowid += stride; } - DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY) - { + DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY) { // Fetch next grid stride ldg if within range if ((gridStrideX + gridDim.x * P::Nblk) < this->n) { updateIndicesY(); @@ -169,9 +147,10 @@ struct PairwiseDistances : public BaseClass { } } - DI void prolog(IdxT gridStrideX, IdxT gridStrideY) - { - if (gridStrideX == blockIdx.x * P::Nblk) { this->ldgXY(0); } + DI void prolog(IdxT gridStrideX, IdxT gridStrideY) { + if (gridStrideX == blockIdx.x * P::Nblk) { + this->ldgXY(0); + } #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { @@ -186,8 +165,7 @@ struct PairwiseDistances : public BaseClass { this->pageWr ^= 1; } - DI void loop() - { + DI void loop() { for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { this->ldgXY(kidx); accumulate(); // on the previous k-block @@ -204,8 +182,7 @@ struct PairwiseDistances : public BaseClass { this->pageRd ^= 1; } - DI void accumulate() - { + DI void accumulate() { #pragma unroll for (int ki = 0; ki < P::Kblk; ki += P::Veclen) { this->ldsXY(ki); @@ -222,8 +199,7 @@ struct PairwiseDistances : public BaseClass { } } - DI void epilog(IdxT gridStrideX, IdxT gridStrideY) - { + DI void epilog(IdxT gridStrideX, IdxT gridStrideY) { if (useNorms) { DataT* sxNorm = (DataT*)(&smem[P::SmemSize]); DataT* syNorm = (&sxNorm[P::Mblk]); @@ -231,13 +207,13 @@ struct PairwiseDistances : public BaseClass { // Load x & y norms required by this threadblock in shmem buffer if (gridStrideX == blockIdx.x * P::Nblk) { for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) { - auto idx = gridStrideY + i; + auto idx = gridStrideY + i; sxNorm[i] = idx < this->m ? xn[idx] : 0; } } for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) { - auto idx = gridStrideX + i; + auto idx = gridStrideX + i; syNorm[i] = idx < this->n ? yn[idx] : 0; } @@ -312,67 +288,42 @@ struct PairwiseDistances : public BaseClass { * @param fin_op the final gemm epilogue lambda */ -template -__global__ __launch_bounds__(Policy::Nthreads, - 2) void pairwiseDistanceMatKernel(const DataT* x, - const DataT* y, - const DataT* _xn, - const DataT* _yn, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - CoreLambda core_op, - EpilogueLambda epilog_op, - FinalLambda fin_op) -{ +template +__global__ __launch_bounds__( + Policy::Nthreads, + 2) void pairwiseDistanceMatKernel(const DataT* x, const DataT* y, + const DataT* _xn, const DataT* _yn, IdxT m, + IdxT n, IdxT k, IdxT lda, IdxT ldb, + IdxT ldd, OutT* dOutput, CoreLambda core_op, + EpilogueLambda epilog_op, + FinalLambda fin_op) { extern __shared__ char smem[]; auto rowEpilog = [] __device__(IdxT starty) { return; }; - PairwiseDistances - obj( - x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op, rowEpilog); + PairwiseDistances + obj(x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, + epilog_op, fin_op, rowEpilog); obj.run(); } template -dim3 launchConfigGenerator(IdxT m, IdxT n, size_t sMemSize, T func) -{ - const auto numSMs = raft::getMultiProcessorCount(); +dim3 launchConfigGenerator(IdxT m, IdxT n, size_t sMemSize, T func) { + const auto numSMs = raft::getMultiProcessorCount(); int numBlocksPerSm = 0; dim3 grid; - CUDA_CHECK( - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, func, P::Nthreads, sMemSize)); + CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlocksPerSm, func, P::Nthreads, sMemSize)); int minGridSize = numSMs * numBlocksPerSm; - int yChunks = raft::ceildiv(m, P::Mblk); - int xChunks = raft::ceildiv(n, P::Nblk); - grid.y = yChunks > minGridSize ? minGridSize : yChunks; - grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks; + int yChunks = raft::ceildiv(m, P::Mblk); + int xChunks = raft::ceildiv(n, P::Nblk); + grid.y = yChunks > minGridSize ? minGridSize : yChunks; + grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks; if (grid.x != 1) { int i = 1; while (grid.y * i < minGridSize) { diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index 773b83ab13..c62f2e5f79 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -31,14 +31,14 @@ class exception : public std::exception { explicit exception() noexcept : std::exception(), msg_() {} /** copy ctor */ - exception(exception const& src) noexcept : std::exception(), msg_(src.what()) - { + exception(exception const& src) noexcept + : std::exception(), msg_(src.what()) { collect_call_stack(); } /** ctor from an input message */ - explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) - { + explicit exception(std::string const msg) noexcept + : std::exception(), msg_(std::move(msg)) { collect_call_stack(); } @@ -51,8 +51,7 @@ class exception : public std::exception { /** append call stack info to this exception's message for ease of debug */ // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html - void collect_call_stack() noexcept - { + void collect_call_stack() noexcept { #ifdef __GNUC__ constexpr int kMaxStackDepth = 64; void* stack[kMaxStackDepth]; // NOLINT @@ -91,16 +90,16 @@ struct logic_error : public raft::exception { // FIXME: Need to be replaced with RAFT_FAIL /** macro to throw a runtime error */ -#define THROW(fmt, ...) \ - do { \ - std::string msg; \ - char errMsg[2048]; /* NOLINT */ \ - std::snprintf( \ - errMsg, sizeof(errMsg), "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ - msg += errMsg; \ - std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \ - msg += errMsg; \ - throw raft::exception(msg); \ +#define THROW(fmt, ...) \ + do { \ + std::string msg; \ + char errMsg[2048]; /* NOLINT */ \ + std::snprintf(errMsg, sizeof(errMsg), \ + "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ + msg += errMsg; \ + std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \ + msg += errMsg; \ + throw raft::exception(msg); \ } while (0) // FIXME: Need to be replaced with RAFT_EXPECTS @@ -110,15 +109,16 @@ struct logic_error : public raft::exception { if (!(check)) THROW(fmt, ##__VA_ARGS__); \ } while (0) -#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) \ - do { \ - char err_msg[2048]; /* NOLINT */ \ - std::snprintf(err_msg, sizeof(err_msg), location_prefix); \ - msg += err_msg; \ - std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, __LINE__); \ - msg += err_msg; \ - std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__); \ - msg += err_msg; \ +#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) \ + do { \ + char err_msg[2048]; /* NOLINT */ \ + std::snprintf(err_msg, sizeof(err_msg), location_prefix); \ + msg += err_msg; \ + std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, \ + __LINE__); \ + msg += err_msg; \ + std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__); \ + msg += err_msg; \ } while (0) /** diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index bb7d22e079..dbe7e83189 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -65,29 +65,29 @@ class handle_t { }()), streams_(n_streams), device_allocator_(std::make_shared()), - host_allocator_(std::make_shared()) - { + host_allocator_(std::make_shared()) { create_resources(); } /** - * @brief Construct a light handle copy from another + * @brief Construct a light handle copy from another * user stream, cuda handles, comms and worker pool are not copied - * The user_stream of the returned handle is set to the specified stream - * of the other handle worker pool - * @param[in] stream_id stream id in `other` worker streams + * The user_stream of the returned handle is set to the specified stream + * of the other handle worker pool + * @param[in] stream_id stream id in `other` worker streams * to be set as user stream in the constructed handle * @param[in] n_streams number worker streams to be created */ - handle_t(const handle_t& other, int stream_id, int n_streams = kNumDefaultWorkerStreams) - : dev_id_(other.get_device()), streams_(n_streams) - { - RAFT_EXPECTS(other.get_num_internal_streams() > 0, - "ERROR: the main handle must have at least one worker stream\n"); - prop_ = other.get_device_properties(); + handle_t(const handle_t& other, int stream_id, + int n_streams = kNumDefaultWorkerStreams) + : dev_id_(other.get_device()), streams_(n_streams) { + RAFT_EXPECTS( + other.get_num_internal_streams() > 0, + "ERROR: the main handle must have at least one worker stream\n"); + prop_ = other.get_device_properties(); device_prop_initialized_ = true; - device_allocator_ = other.get_device_allocator(); - host_allocator_ = other.get_host_allocator(); + device_allocator_ = other.get_device_allocator(); + host_allocator_ = other.get_host_allocator(); create_resources(); set_stream(other.get_internal_stream(stream_id)); } @@ -99,22 +99,25 @@ class handle_t { void set_stream(cudaStream_t stream) { user_stream_ = stream; } cudaStream_t get_stream() const { return user_stream_; } - rmm::cuda_stream_view get_stream_view() const { return rmm::cuda_stream_view(user_stream_); } + rmm::cuda_stream_view get_stream_view() const { + return rmm::cuda_stream_view(user_stream_); + } - void set_device_allocator(std::shared_ptr allocator) - { + void set_device_allocator(std::shared_ptr allocator) { device_allocator_ = allocator; } - std::shared_ptr get_device_allocator() const { return device_allocator_; } + std::shared_ptr get_device_allocator() const { + return device_allocator_; + } - void set_host_allocator(std::shared_ptr allocator) - { + void set_host_allocator(std::shared_ptr allocator) { host_allocator_ = allocator; } - std::shared_ptr get_host_allocator() const { return host_allocator_; } + std::shared_ptr get_host_allocator() const { + return host_allocator_; + } - cublasHandle_t get_cublas_handle() const - { + cublasHandle_t get_cublas_handle() const { std::lock_guard _(mutex_); if (!cublas_initialized_) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); @@ -123,8 +126,7 @@ class handle_t { return cublas_handle_; } - cusolverDnHandle_t get_cusolver_dn_handle() const - { + cusolverDnHandle_t get_cusolver_dn_handle() const { std::lock_guard _(mutex_); if (!cusolver_dn_initialized_) { CUSOLVER_CHECK(cusolverDnCreate(&cusolver_dn_handle_)); @@ -133,8 +135,7 @@ class handle_t { return cusolver_dn_handle_; } - cusolverSpHandle_t get_cusolver_sp_handle() const - { + cusolverSpHandle_t get_cusolver_sp_handle() const { std::lock_guard _(mutex_); if (!cusolver_sp_initialized_) { CUSOLVER_CHECK(cusolverSpCreate(&cusolver_sp_handle_)); @@ -143,8 +144,7 @@ class handle_t { return cusolver_sp_handle_; } - cusparseHandle_t get_cusparse_handle() const - { + cusparseHandle_t get_cusparse_handle() const { std::lock_guard _(mutex_); if (!cusparse_initialized_) { CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); @@ -154,13 +154,16 @@ class handle_t { } // legacy compatibility for cuML - cudaStream_t get_internal_stream(int sid) const { return streams_.get_stream(sid).value(); } + cudaStream_t get_internal_stream(int sid) const { + return streams_.get_stream(sid).value(); + } // new accessor return rmm::cuda_stream_view - rmm::cuda_stream_view get_internal_stream_view(int sid) const { return streams_.get_stream(sid); } + rmm::cuda_stream_view get_internal_stream_view(int sid) const { + return streams_.get_stream(sid); + } int get_num_internal_streams() const { return streams_.get_pool_size(); } - std::vector get_internal_streams() const - { + std::vector get_internal_streams() const { std::vector int_streams_vec; for (int i = 0; i < get_num_internal_streams(); i++) { int_streams_vec.push_back(get_internal_stream(i)); @@ -168,51 +171,49 @@ class handle_t { return int_streams_vec; } - void wait_on_user_stream() const - { + void wait_on_user_stream() const { CUDA_CHECK(cudaEventRecord(event_, user_stream_)); for (int i = 0; i < get_num_internal_streams(); i++) { CUDA_CHECK(cudaStreamWaitEvent(get_internal_stream(i), event_, 0)); } } - void wait_on_internal_streams() const - { + void wait_on_internal_streams() const { for (int i = 0; i < get_num_internal_streams(); i++) { CUDA_CHECK(cudaEventRecord(event_, get_internal_stream(i))); CUDA_CHECK(cudaStreamWaitEvent(user_stream_, event_, 0)); } } - void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } + void set_comms(std::shared_ptr communicator) { + communicator_ = communicator; + } - const comms::comms_t& get_comms() const - { - RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); + const comms::comms_t& get_comms() const { + RAFT_EXPECTS(this->comms_initialized(), + "ERROR: Communicator was not initialized\n"); return *communicator_; } - void set_subcomm(std::string key, std::shared_ptr subcomm) - { + void set_subcomm(std::string key, std::shared_ptr subcomm) { subcomms_[key] = subcomm; } - const comms::comms_t& get_subcomm(std::string key) const - { - RAFT_EXPECTS( - subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); + const comms::comms_t& get_subcomm(std::string key) const { + RAFT_EXPECTS(subcomms_.find(key) != subcomms_.end(), + "%s was not found in subcommunicators.", key.c_str()); auto subcomm = subcomms_.at(key); - RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized"); + RAFT_EXPECTS(nullptr != subcomm.get(), + "ERROR: Subcommunicator was not initialized"); return *subcomm; } bool comms_initialized() const { return (nullptr != communicator_.get()); } - const cudaDeviceProp& get_device_properties() const - { + const cudaDeviceProp& get_device_properties() const { std::lock_guard _(mutex_); if (!device_prop_initialized_) { CUDA_CHECK(cudaGetDeviceProperties(&prop_, dev_id_)); @@ -243,28 +244,29 @@ class handle_t { mutable bool device_prop_initialized_{false}; mutable std::mutex mutex_; - void create_resources() { CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); } + void create_resources() { + CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } - void destroy_resources() - { + void destroy_resources() { ///@todo: enable *_NO_THROW variants once we have enabled logging if (cusparse_initialized_) { - // CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_)); + //CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_)); CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_)); } if (cusolver_dn_initialized_) { - // CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); + //CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); CUSOLVER_CHECK(cusolverDnDestroy(cusolver_dn_handle_)); } if (cusolver_sp_initialized_) { - // CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); + //CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); CUSOLVER_CHECK(cusolverSpDestroy(cusolver_sp_handle_)); } if (cublas_initialized_) { - // CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_)); + //CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_)); CUBLAS_CHECK(cublasDestroy(cublas_handle_)); } - // CUDA_CHECK_NO_THROW(cudaEventDestroy(event_)); + //CUDA_CHECK_NO_THROW(cudaEventDestroy(event_)); CUDA_CHECK(cudaEventDestroy(event_)); } }; // class handle_t @@ -274,8 +276,7 @@ class handle_t { */ class stream_syncer { public: - explicit stream_syncer(const handle_t& handle) : handle_(handle) - { + explicit stream_syncer(const handle_t& handle) : handle_(handle) { handle_.wait_on_user_stream(); } ~stream_syncer() { handle_.wait_on_internal_streams(); } diff --git a/cpp/include/raft/integer_utils.h b/cpp/include/raft/integer_utils.h index 5fc56de14b..a7cfb9287b 100644 --- a/cpp/include/raft/integer_utils.h +++ b/cpp/include/raft/integer_utils.h @@ -34,13 +34,15 @@ namespace raft { * `modulus` is positive. */ template -inline S round_up_safe(S number_to_round, S modulus) -{ +inline S round_up_safe(S number_to_round, S modulus) { auto remainder = number_to_round % modulus; - if (remainder == 0) { return number_to_round; } + if (remainder == 0) { + return number_to_round; + } auto rounded_up = number_to_round - remainder + modulus; if (rounded_up < number_to_round) { - throw std::invalid_argument("Attempt to round up beyond the type's maximum value"); + throw std::invalid_argument( + "Attempt to round up beyond the type's maximum value"); } return rounded_up; } @@ -51,9 +53,8 @@ inline S round_up_safe(S number_to_round, S modulus) * `modulus` is positive. */ template -inline S round_down_safe(S number_to_round, S modulus) -{ - auto remainder = number_to_round % modulus; +inline S round_down_safe(S number_to_round, S modulus) { + auto remainder = number_to_round % modulus; auto rounded_down = number_to_round - remainder; return rounded_down; } @@ -71,28 +72,25 @@ inline S round_down_safe(S number_to_round, S modulus) * the result will be incorrect */ template -constexpr inline S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept -{ +constexpr inline S div_rounding_up_unsafe(const S& dividend, + const T& divisor) noexcept { return (dividend + divisor - 1) / divisor; } namespace detail { template constexpr inline I div_rounding_up_safe(std::integral_constant, - I dividend, - I divisor) noexcept -{ + I dividend, I divisor) noexcept { // TODO: This could probably be implemented faster - return (dividend > divisor) ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor) - : (dividend > 0); + return (dividend > divisor) + ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor) + : (dividend > 0); } template constexpr inline I div_rounding_up_safe(std::integral_constant, - I dividend, - I divisor) noexcept -{ - auto quotient = dividend / divisor; + I dividend, I divisor) noexcept { + auto quotient = dividend / divisor; auto remainder = dividend % divisor; return quotient + (remainder != 0); } @@ -112,17 +110,16 @@ constexpr inline I div_rounding_up_safe(std::integral_constant, * approach of using (dividend + divisor - 1) / divisor */ template -constexpr inline std::enable_if_t::value, I> div_rounding_up_safe( - I dividend, I divisor) noexcept -{ - using i_is_a_signed_type = std::integral_constant::value>; +constexpr inline std::enable_if_t::value, I> +div_rounding_up_safe(I dividend, I divisor) noexcept { + using i_is_a_signed_type = + std::integral_constant::value>; return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor); } template -constexpr inline std::enable_if_t::value, bool> is_a_power_of_two( - I val) noexcept -{ +constexpr inline std::enable_if_t::value, bool> +is_a_power_of_two(I val) noexcept { return ((val - 1) & val) == 0; } @@ -150,14 +147,14 @@ constexpr inline std::enable_if_t::value, bool> is_a_power_o * @return Absolute value if value type is signed. */ template -std::enable_if_t::value, T> constexpr inline absolute_value(T value) -{ +std::enable_if_t::value, T> constexpr inline absolute_value( + T value) { return std::abs(value); } // Unsigned type just returns itself. template -std::enable_if_t::value, T> constexpr inline absolute_value(T value) -{ +std::enable_if_t::value, T> constexpr inline absolute_value( + T value) { return value; } diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh index 0bbfa2bb3c..0da7da2eb6 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/classlabels.cuh @@ -43,35 +43,33 @@ namespace label { * \param [in] allocator device allocator */ template -void getUniquelabels(value_t* y, - size_t n, - value_t** y_unique, - int* n_unique, +void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique, cudaStream_t stream, - std::shared_ptr allocator) -{ + std::shared_ptr allocator) { raft::mr::device::buffer y2(allocator, stream, n); raft::mr::device::buffer y3(allocator, stream, n); raft::mr::device::buffer d_num_selected(allocator, stream, 1); - size_t bytes = 0; + size_t bytes = 0; size_t bytes2 = 0; // Query how much temporary storage we will need for cub operations // and allocate it cub::DeviceRadixSort::SortKeys(NULL, bytes, y, y2.data(), n); - cub::DeviceSelect::Unique(NULL, bytes2, y2.data(), y3.data(), d_num_selected.data(), n); + cub::DeviceSelect::Unique(NULL, bytes2, y2.data(), y3.data(), + d_num_selected.data(), n); bytes = max(bytes, bytes2); raft::mr::device::buffer cub_storage(allocator, stream, bytes); // Select Unique classes cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, y2.data(), n); - cub::DeviceSelect::Unique( - cub_storage.data(), bytes, y2.data(), y3.data(), d_num_selected.data(), n); + cub::DeviceSelect::Unique(cub_storage.data(), bytes, y2.data(), y3.data(), + d_num_selected.data(), n); raft::update_host(n_unique, d_num_selected.data(), 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); // Copy unique classes to output - *y_unique = (value_t*)allocator->allocate(*n_unique * sizeof(value_t), stream); + *y_unique = + (value_t *)allocator->allocate(*n_unique * sizeof(value_t), stream); raft::copy(*y_unique, y3.data(), *n_unique, stream); } @@ -94,17 +92,16 @@ void getUniquelabels(value_t* y, * \param [in] stream cuda stream */ template -void getOvrlabels( - value_t* y, int n, value_t* y_unique, int n_classes, value_t* y_out, int idx, cudaStream_t stream) -{ +void getOvrlabels(value_t *y, int n, value_t *y_unique, int n_classes, + value_t *y_out, int idx, cudaStream_t stream) { ASSERT(idx < n_classes, "Parameter idx should not be larger than the number " "of classes"); raft::linalg::unaryOp( - y_out, - y, - n, - [idx, y_unique] __device__(value_t y) { return y == y_unique[idx] ? +1 : -1; }, + y_out, y, n, + [idx, y_unique] __device__(value_t y) { + return y == y_unique[idx] ? +1 : -1; + }, stream); CUDA_CHECK(cudaPeekAtLastError()); } @@ -113,14 +110,9 @@ void getOvrlabels( // +/-1, return array with the new class labels and corresponding indices. template -__global__ void map_label_kernel(Type* map_ids, - size_t N_labels, - Type* in, - Type* out, - size_t N, - Lambda filter_op, - bool zero_based = false) -{ +__global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in, + Type *out, size_t N, Lambda filter_op, + bool zero_based = false) { int tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { if (!filter_op(in[tid])) { @@ -135,75 +127,68 @@ __global__ void map_label_kernel(Type* map_ids, } /** - * Maps an input array containing a series of numbers into a new array - * where numbers have been mapped to a monotonically increasing set - * of labels. This can be useful in machine learning algorithms, for instance, - * where a given set of labels is not taken from a monotonically increasing - * set. This can happen if they are filtered or if only a subset of the - * total labels are used in a dataset. This is also useful in graph algorithms - * where a set of vertices need to be labeled in a monotonically increasing - * order. - * @tparam Type the numeric type of the input and output arrays - * @tparam Lambda the type of an optional filter function, which determines - * which items in the array to map. - * @param out the output monotonic array - * @param in input label array - * @param N number of elements in the input array - * @param stream cuda stream to use - * @param filter_op an optional function for specifying which values - * should have monotonically increasing labels applied to them. - */ + * Maps an input array containing a series of numbers into a new array + * where numbers have been mapped to a monotonically increasing set + * of labels. This can be useful in machine learning algorithms, for instance, + * where a given set of labels is not taken from a monotonically increasing + * set. This can happen if they are filtered or if only a subset of the + * total labels are used in a dataset. This is also useful in graph algorithms + * where a set of vertices need to be labeled in a monotonically increasing + * order. + * @tparam Type the numeric type of the input and output arrays + * @tparam Lambda the type of an optional filter function, which determines + * which items in the array to map. + * @param out the output monotonic array + * @param in input label array + * @param N number of elements in the input array + * @param stream cuda stream to use + * @param filter_op an optional function for specifying which values + * should have monotonically increasing labels applied to them. + */ template -void make_monotonic(Type* out, - Type* in, - size_t N, - cudaStream_t stream, +void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, Lambda filter_op, std::shared_ptr allocator, - bool zero_based = false) -{ + bool zero_based = false) { static const size_t TPB_X = 256; dim3 blocks(raft::ceildiv(N, TPB_X)); dim3 threads(TPB_X); - Type* map_ids; + Type *map_ids; int num_clusters; getUniquelabels(in, N, &map_ids, &num_clusters, stream, allocator); - map_label_kernel - <<>>(map_ids, num_clusters, in, out, N, filter_op, zero_based); + map_label_kernel<<>>( + map_ids, num_clusters, in, out, N, filter_op, zero_based); allocator->deallocate(map_ids, num_clusters * sizeof(Type), stream); } /** - * Maps an input array containing a series of numbers into a new array - * where numbers have been mapped to a monotonically increasing set - * of labels. This can be useful in machine learning algorithms, for instance, - * where a given set of labels is not taken from a monotonically increasing - * set. This can happen if they are filtered or if only a subset of the - * total labels are used in a dataset. This is also useful in graph algorithms - * where a set of vertices need to be labeled in a monotonically increasing - * order. - * @tparam Type the numeric type of the input and output arrays - * @tparam Lambda the type of an optional filter function, which determines - * which items in the array to map. - * @param out output label array with labels assigned monotonically - * @param in input label array - * @param N number of elements in the input array - * @param stream cuda stream to use - */ + * Maps an input array containing a series of numbers into a new array + * where numbers have been mapped to a monotonically increasing set + * of labels. This can be useful in machine learning algorithms, for instance, + * where a given set of labels is not taken from a monotonically increasing + * set. This can happen if they are filtered or if only a subset of the + * total labels are used in a dataset. This is also useful in graph algorithms + * where a set of vertices need to be labeled in a monotonically increasing + * order. + * @tparam Type the numeric type of the input and output arrays + * @tparam Lambda the type of an optional filter function, which determines + * which items in the array to map. + * @param out output label array with labels assigned monotonically + * @param in input label array + * @param N number of elements in the input array + * @param stream cuda stream to use + */ template -void make_monotonic(Type* out, - Type* in, - size_t N, - cudaStream_t stream, +void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, std::shared_ptr allocator, - bool zero_based = false) -{ + bool zero_based = false) { make_monotonic( - out, in, N, stream, [] __device__(Type val) { return false; }, allocator, zero_based); + out, in, N, stream, [] __device__(Type val) { return false; }, allocator, + zero_based); } }; // namespace label }; // end namespace raft diff --git a/cpp/include/raft/label/merge_labels.cuh b/cpp/include/raft/label/merge_labels.cuh index 1ee0659b0d..bed74581a2 100644 --- a/cpp/include/raft/label/merge_labels.cuh +++ b/cpp/include/raft/label/merge_labels.cuh @@ -35,10 +35,8 @@ __global__ void __launch_bounds__(TPB_X) propagate_label_kernel(const value_idx* __restrict__ labels_a, const value_idx* __restrict__ labels_b, value_idx* __restrict__ R, - const bool* __restrict__ mask, - bool* __restrict__ m, - value_idx N) -{ + const bool* __restrict__ mask, bool* __restrict__ m, + value_idx N) { value_idx tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { if (__ldg((char*)mask + tid)) { @@ -67,17 +65,15 @@ template __global__ void __launch_bounds__(TPB_X) reassign_label_kernel(value_idx* __restrict__ labels_a, const value_idx* __restrict__ labels_b, - const value_idx* __restrict__ R, - value_idx N, - value_idx MAX_LABEL) -{ + const value_idx* __restrict__ R, value_idx N, + value_idx MAX_LABEL) { value_idx tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { // Note: labels are from 1 to N - value_idx la = labels_a[tid]; - value_idx lb = __ldg(labels_b + tid); - value_idx ra = (la == MAX_LABEL) ? MAX_LABEL : __ldg(R + (la - 1)) + 1; - value_idx rb = (lb == MAX_LABEL) ? MAX_LABEL : __ldg(R + (lb - 1)) + 1; + value_idx la = labels_a[tid]; + value_idx lb = __ldg(labels_b + tid); + value_idx ra = (la == MAX_LABEL) ? MAX_LABEL : __ldg(R + (la - 1)) + 1; + value_idx rb = (lb == MAX_LABEL) ? MAX_LABEL : __ldg(R + (lb - 1)) + 1; labels_a[tid] = min(ra, rb); } } @@ -112,14 +108,9 @@ __global__ void __launch_bounds__(TPB_X) * @param[in] stream CUDA stream */ template -void merge_labels(value_idx* labels_a, - const value_idx* labels_b, - const bool* mask, - value_idx* R, - bool* m, - value_idx N, - cudaStream_t stream) -{ +void merge_labels(value_idx* labels_a, const value_idx* labels_b, + const bool* mask, value_idx* R, bool* m, value_idx N, + cudaStream_t stream) { dim3 blocks(raft::ceildiv(N, value_idx(TPB_X))); dim3 threads(TPB_X); value_idx MAX_LABEL = std::numeric_limits::max(); diff --git a/cpp/include/raft/lap/d_structs.h b/cpp/include/raft/lap/d_structs.h index e488dc528f..ed545b7198 100644 --- a/cpp/include/raft/lap/d_structs.h +++ b/cpp/include/raft/lap/d_structs.h @@ -26,18 +26,18 @@ template struct Vertices { - vertex_t* row_assignments; - vertex_t* col_assignments; - int* row_covers; - int* col_covers; - weight_t* row_duals; - weight_t* col_duals; - weight_t* col_slacks; + vertex_t *row_assignments; + vertex_t *col_assignments; + int *row_covers; + int *col_covers; + weight_t *row_duals; + weight_t *col_duals; + weight_t *col_slacks; }; template struct VertexData { - vertex_t* parents; - vertex_t* children; - int* is_visited; + vertex_t *parents; + vertex_t *children; + int *is_visited; }; diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh index 64b6a31efb..6bc1c08029 100644 --- a/cpp/include/raft/lap/lap.cuh +++ b/cpp/include/raft/lap/lap.cuh @@ -38,12 +38,12 @@ class LinearAssignmentProblem { vertex_t batchsize_; weight_t epsilon_; - weight_t const* d_costs_; + weight_t const *d_costs_; Vertices d_vertices_dev; VertexData d_row_data_dev, d_col_data_dev; - raft::handle_t const& handle_; + raft::handle_t const &handle_; raft::mr::device::buffer row_covers_v; raft::mr::device::buffer col_covers_v; raft::mr::device::buffer row_duals_v; @@ -59,10 +59,8 @@ class LinearAssignmentProblem { raft::mr::device::buffer obj_val_dual_v; public: - LinearAssignmentProblem(raft::handle_t const& handle, - vertex_t size, - vertex_t batchsize, - weight_t epsilon) + LinearAssignmentProblem(raft::handle_t const &handle, vertex_t size, + vertex_t batchsize, weight_t epsilon) : handle_(handle), size_(size), batchsize_(batchsize), @@ -80,13 +78,11 @@ class LinearAssignmentProblem { row_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0), col_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0), obj_val_primal_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - obj_val_dual_v(handle_.get_device_allocator(), handle_.get_stream(), 0) - { - } + obj_val_dual_v(handle_.get_device_allocator(), handle_.get_stream(), 0) {} // Executes Hungarian algorithm on the input cost matrix. - void solve(weight_t const* d_cost_matrix, vertex_t* d_row_assignment, vertex_t* d_col_assignment) - { + void solve(weight_t const *d_cost_matrix, vertex_t *d_row_assignment, + vertex_t *d_col_assignment) { initializeDevice(); d_vertices_dev.row_assignments = d_row_assignment; @@ -98,13 +94,27 @@ class LinearAssignmentProblem { while (step != 100) { switch (step) { - case 0: step = hungarianStep0(); break; - case 1: step = hungarianStep1(); break; - case 2: step = hungarianStep2(); break; - case 3: step = hungarianStep3(); break; - case 4: step = hungarianStep4(); break; - case 5: step = hungarianStep5(); break; - case 6: step = hungarianStep6(); break; + case 0: + step = hungarianStep0(); + break; + case 1: + step = hungarianStep1(); + break; + case 2: + step = hungarianStep2(); + break; + case 3: + step = hungarianStep3(); + break; + case 4: + step = hungarianStep4(); + break; + case 5: + step = hungarianStep5(); + break; + case 6: + step = hungarianStep6(); + break; } } @@ -112,39 +122,36 @@ class LinearAssignmentProblem { } // Function for getting optimal row dual vector for subproblem spId. - std::pair getRowDualVector(int spId) const - { + std::pair getRowDualVector(int spId) const { return std::make_pair(row_duals_v.data() + spId * size_, size_); } // Function for getting optimal col dual vector for subproblem spId. - std::pair getColDualVector(int spId) - { + std::pair getColDualVector(int spId) { return std::make_pair(col_duals_v.data() + spId * size_, size_); } // Function for getting optimal primal objective value for subproblem spId. - weight_t getPrimalObjectiveValue(int spId) - { + weight_t getPrimalObjectiveValue(int spId) { weight_t result; - raft::update_host(&result, obj_val_primal_v.data() + spId, 1, handle_.get_stream()); + raft::update_host(&result, obj_val_primal_v.data() + spId, 1, + handle_.get_stream()); CHECK_CUDA(handle_.get_stream()); return result; } // Function for getting optimal dual objective value for subproblem spId. - weight_t getDualObjectiveValue(int spId) - { + weight_t getDualObjectiveValue(int spId) { weight_t result; - raft::update_host(&result, obj_val_dual_v.data() + spId, 1, handle_.get_stream()); + raft::update_host(&result, obj_val_dual_v.data() + spId, 1, + handle_.get_stream()); CHECK_CUDA(handle_.get_stream()); return result; } private: // Helper function for initializing global variables and arrays on a single host. - void initializeDevice() - { + void initializeDevice() { row_covers_v.resize(batchsize_ * size_); col_covers_v.resize(batchsize_ * size_); row_duals_v.resize(batchsize_ * size_); @@ -162,36 +169,39 @@ class LinearAssignmentProblem { d_vertices_dev.row_covers = row_covers_v.data(); d_vertices_dev.col_covers = col_covers_v.data(); - d_vertices_dev.row_duals = row_duals_v.data(); - d_vertices_dev.col_duals = col_duals_v.data(); + d_vertices_dev.row_duals = row_duals_v.data(); + d_vertices_dev.col_duals = col_duals_v.data(); d_vertices_dev.col_slacks = col_slacks_v.data(); d_row_data_dev.is_visited = row_is_visited_v.data(); d_col_data_dev.is_visited = col_is_visited_v.data(); - d_row_data_dev.parents = row_parents_v.data(); - d_row_data_dev.children = row_children_v.data(); - d_col_data_dev.parents = col_parents_v.data(); - d_col_data_dev.children = col_children_v.data(); - - thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(), int{0}); - thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(), int{0}); - thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(), weight_t{0}); - thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(), weight_t{0}); + d_row_data_dev.parents = row_parents_v.data(); + d_row_data_dev.children = row_children_v.data(); + d_col_data_dev.parents = col_parents_v.data(); + d_col_data_dev.children = col_children_v.data(); + + thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(), + int{0}); + thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(), + int{0}); + thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(), + weight_t{0}); + thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(), + weight_t{0}); } // Function for calculating initial zeros by subtracting row and column minima from each element. - int hungarianStep0() - { - detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_, size_); + int hungarianStep0() { + detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_, + size_); return 1; } // Function for calculating initial zeros by subtracting row and column minima from each element. - int hungarianStep1() - { - detail::computeInitialAssignments( - handle_, d_costs_, d_vertices_dev, batchsize_, size_, epsilon_); + int hungarianStep1() { + detail::computeInitialAssignments(handle_, d_costs_, d_vertices_dev, + batchsize_, size_, epsilon_); int next = 2; @@ -207,10 +217,10 @@ class LinearAssignmentProblem { } // Function for checking optimality and constructing predicates and covers. - int hungarianStep2() - { - int cover_count = detail::computeRowCovers( - handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_); + int hungarianStep2() { + int cover_count = + detail::computeRowCovers(handle_, d_vertices_dev, d_row_data_dev, + d_col_data_dev, batchsize_, size_); int next = (cover_count == batchsize_ * size_) ? 6 : 3; @@ -218,23 +228,17 @@ class LinearAssignmentProblem { } // Function for building alternating tree rooted at unassigned rows. - int hungarianStep3() - { + int hungarianStep3() { int next; - raft::mr::device::buffer flag_v(handle_.get_device_allocator(), handle_.get_stream(), 1); + raft::mr::device::buffer flag_v(handle_.get_device_allocator(), + handle_.get_stream(), 1); bool h_flag = false; raft::update_device(flag_v.data(), &h_flag, 1, handle_.get_stream()); - detail::executeZeroCover(handle_, - d_costs_, - d_vertices_dev, - d_row_data_dev, - d_col_data_dev, - flag_v.data(), - batchsize_, - size_, + detail::executeZeroCover(handle_, d_costs_, d_vertices_dev, d_row_data_dev, + d_col_data_dev, flag_v.data(), batchsize_, size_, epsilon_); raft::update_host(&h_flag, flag_v.data(), 1, handle_.get_stream()); @@ -245,36 +249,31 @@ class LinearAssignmentProblem { } // Function for augmenting the solution along multiple node-disjoint alternating trees. - int hungarianStep4() - { - detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_, size_); + int hungarianStep4() { + detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_, + size_); - detail::augmentationPass( - handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_); + detail::augmentationPass(handle_, d_vertices_dev, d_row_data_dev, + d_col_data_dev, batchsize_, size_); return 2; } // Function for updating dual solution to introduce new zero-cost arcs. - int hungarianStep5() - { - detail::dualUpdate( - handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_, epsilon_); + int hungarianStep5() { + detail::dualUpdate(handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, + batchsize_, size_, epsilon_); return 3; } // Function for calculating primal and dual objective values at optimality. - int hungarianStep6() - { - detail::calcObjValPrimal(handle_, - obj_val_primal_v.data(), - d_costs_, - d_vertices_dev.row_assignments, - batchsize_, - size_); + int hungarianStep6() { + detail::calcObjValPrimal(handle_, obj_val_primal_v.data(), d_costs_, + d_vertices_dev.row_assignments, batchsize_, size_); - detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev, batchsize_, size_); + detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev, + batchsize_, size_); return 100; } diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/lap_functions.cuh index 9bbd44bf09..0079f50e82 100644 --- a/cpp/include/raft/lap/lap_functions.cuh +++ b/cpp/include/raft/lap/lap_functions.cuh @@ -46,26 +46,20 @@ const int BLOCKDIMX{64}; const int BLOCKDIMY{1}; // Function for calculating grid and block dimensions from the given input size. -inline void calculateLinearDims(dim3& blocks_per_grid, - dim3& threads_per_block, - int& total_blocks, - int size) -{ +inline void calculateLinearDims(dim3 &blocks_per_grid, dim3 &threads_per_block, + int &total_blocks, int size) { threads_per_block.x = BLOCKDIMX * BLOCKDIMY; int value = size / threads_per_block.x; if (size % threads_per_block.x > 0) value++; - total_blocks = value; + total_blocks = value; blocks_per_grid.x = value; } // Function for calculating grid and block dimensions from the given input size for square grid. -inline void calculateSquareDims(dim3& blocks_per_grid, - dim3& threads_per_block, - int& total_blocks, - int size) -{ +inline void calculateSquareDims(dim3 &blocks_per_grid, dim3 &threads_per_block, + int &total_blocks, int size) { threads_per_block.x = BLOCKDIMX; threads_per_block.y = BLOCKDIMY; @@ -74,16 +68,15 @@ inline void calculateSquareDims(dim3& blocks_per_grid, int valuex = (int)ceil((float)(sq_size) / BLOCKDIMX); int valuey = (int)ceil((float)(sq_size) / BLOCKDIMY); - total_blocks = valuex * valuey; + total_blocks = valuex * valuey; blocks_per_grid.x = valuex; blocks_per_grid.y = valuey; } -// Function for calculating grid and block dimensions from the given input size for rectangular -// grid. -inline void calculateRectangularDims( - dim3& blocks_per_grid, dim3& threads_per_block, int& total_blocks, int xsize, int ysize) -{ +// Function for calculating grid and block dimensions from the given input size for rectangular grid. +inline void calculateRectangularDims(dim3 &blocks_per_grid, + dim3 &threads_per_block, int &total_blocks, + int xsize, int ysize) { threads_per_block.x = BLOCKDIMX; threads_per_block.y = BLOCKDIMY; @@ -93,18 +86,16 @@ inline void calculateRectangularDims( int valuey = ysize / threads_per_block.y; if (ysize % threads_per_block.y > 0) valuey++; - total_blocks = valuex * valuey; + total_blocks = valuex * valuey; blocks_per_grid.x = valuex; blocks_per_grid.y = valuey; } template -inline void initialReduction(raft::handle_t const& handle, - weight_t const* d_costs, - Vertices& d_vertices_dev, - int SP, - vertex_t N) -{ +inline void initialReduction(raft::handle_t const &handle, + weight_t const *d_costs, + Vertices &d_vertices_dev, + int SP, vertex_t N) { dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; @@ -112,38 +103,34 @@ inline void initialReduction(raft::handle_t const& handle, raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_rowReduction<<>>( - d_costs, d_vertices_dev.row_duals, SP, N, std::numeric_limits::max()); + kernel_rowReduction<<>>( + d_costs, d_vertices_dev.row_duals, SP, N, + std::numeric_limits::max()); CHECK_CUDA(handle.get_stream()); - kernel_columnReduction<<>>( - d_costs, - d_vertices_dev.row_duals, - d_vertices_dev.col_duals, - SP, - N, + kernel_columnReduction<<>>( + d_costs, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N, std::numeric_limits::max()); CHECK_CUDA(handle.get_stream()); } template -inline void computeInitialAssignments(raft::handle_t const& handle, - weight_t const* d_costs, - Vertices& d_vertices, - int SP, - vertex_t N, - weight_t epsilon) -{ +inline void computeInitialAssignments(raft::handle_t const &handle, + weight_t const *d_costs, + Vertices &d_vertices, + int SP, vertex_t N, weight_t epsilon) { dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; std::size_t size = SP * N; - raft::mr::device::buffer row_lock_v( - handle.get_device_allocator(), handle.get_stream(), size); - raft::mr::device::buffer col_lock_v( - handle.get_device_allocator(), handle.get_stream(), size); + raft::mr::device::buffer row_lock_v(handle.get_device_allocator(), + handle.get_stream(), size); + raft::mr::device::buffer col_lock_v(handle.get_device_allocator(), + handle.get_stream(), size); thrust::fill_n(thrust::device, d_vertices.row_assignments, size, -1); thrust::fill_n(thrust::device, d_vertices.col_assignments, size, -1); @@ -153,29 +140,21 @@ inline void computeInitialAssignments(raft::handle_t const& handle, raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_computeInitialAssignments<<>>( - d_costs, - d_vertices.row_duals, - d_vertices.col_duals, - d_vertices.row_assignments, - d_vertices.col_assignments, - row_lock_v.data(), - col_lock_v.data(), - SP, - N, - epsilon); + kernel_computeInitialAssignments<<>>( + d_costs, d_vertices.row_duals, d_vertices.col_duals, + d_vertices.row_assignments, d_vertices.col_assignments, row_lock_v.data(), + col_lock_v.data(), SP, N, epsilon); CHECK_CUDA(handle.get_stream()); } // Function for finding row cover on individual devices. template -inline int computeRowCovers(raft::handle_t const& handle, - Vertices& d_vertices, - VertexData& d_row_data, - VertexData& d_col_data, - int SP, - vertex_t N) -{ +inline int computeRowCovers(raft::handle_t const &handle, + Vertices &d_vertices, + VertexData &d_row_data, + VertexData &d_col_data, int SP, + vertex_t N) { dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; @@ -184,7 +163,8 @@ inline int computeRowCovers(raft::handle_t const& handle, thrust::fill_n(thrust::device, d_vertices.row_covers, size, int{0}); thrust::fill_n(thrust::device, d_vertices.col_covers, size, int{0}); - thrust::fill_n(thrust::device, d_vertices.col_slacks, size, std::numeric_limits::max()); + thrust::fill_n(thrust::device, d_vertices.col_slacks, size, + std::numeric_limits::max()); thrust::fill_n(thrust::device, d_row_data.is_visited, size, DORMANT); thrust::fill_n(thrust::device, d_col_data.is_visited, size, DORMANT); thrust::fill_n(thrust::device, d_row_data.parents, size, vertex_t{-1}); @@ -194,28 +174,25 @@ inline int computeRowCovers(raft::handle_t const& handle, raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_computeRowCovers<<>>( - d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited, SP, N); + kernel_computeRowCovers<<>>( + d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited, + SP, N); CHECK_CUDA(handle.get_stream()); - return thrust::reduce(thrust::device, d_vertices.row_covers, d_vertices.row_covers + size); + return thrust::reduce(thrust::device, d_vertices.row_covers, + d_vertices.row_covers + size); } // Function for covering the zeros in uncovered rows and expanding the frontier. template -inline void coverZeroAndExpand(raft::handle_t const& handle, - weight_t const* d_costs_dev, - vertex_t const* d_rows_csr_neighbors, - vertex_t const* d_rows_csr_ptrs, - Vertices& d_vertices_dev, - VertexData& d_row_data_dev, - VertexData& d_col_data_dev, - bool* d_flag, - int SP, - vertex_t N, - weight_t epsilon) -{ +inline void coverZeroAndExpand( + raft::handle_t const &handle, weight_t const *d_costs_dev, + vertex_t const *d_rows_csr_neighbors, vertex_t const *d_rows_csr_ptrs, + Vertices &d_vertices_dev, + VertexData &d_row_data_dev, VertexData &d_col_data_dev, + bool *d_flag, int SP, vertex_t N, weight_t epsilon) { int total_blocks = 0; dim3 blocks_per_grid; dim3 threads_per_block; @@ -223,34 +200,24 @@ inline void coverZeroAndExpand(raft::handle_t const& handle, raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_coverAndExpand<<>>( - d_flag, - d_rows_csr_ptrs, - d_rows_csr_neighbors, - d_costs_dev, - d_vertices_dev, - d_row_data_dev, - d_col_data_dev, - SP, - N, - epsilon); + kernel_coverAndExpand<<>>( + d_flag, d_rows_csr_ptrs, d_rows_csr_neighbors, d_costs_dev, d_vertices_dev, + d_row_data_dev, d_col_data_dev, SP, N, epsilon); } template -inline vertex_t zeroCoverIteration(raft::handle_t const& handle, - weight_t const* d_costs_dev, - Vertices& d_vertices_dev, - VertexData& d_row_data_dev, - VertexData& d_col_data_dev, - bool* d_flag, - int SP, - vertex_t N, - weight_t epsilon) -{ +inline vertex_t zeroCoverIteration(raft::handle_t const &handle, + weight_t const *d_costs_dev, + Vertices &d_vertices_dev, + VertexData &d_row_data_dev, + VertexData &d_col_data_dev, + bool *d_flag, int SP, vertex_t N, + weight_t epsilon) { vertex_t M; - raft::mr::device::buffer csr_ptrs_v( - handle.get_device_allocator(), handle.get_stream(), 0); + raft::mr::device::buffer csr_ptrs_v(handle.get_device_allocator(), + handle.get_stream(), 0); raft::mr::device::buffer csr_neighbors_v( handle.get_device_allocator(), handle.get_stream(), 0); @@ -259,8 +226,8 @@ inline vertex_t zeroCoverIteration(raft::handle_t const& handle, dim3 threads_per_block; int total_blocks = 0; - raft::mr::device::buffer predicates_v( - handle.get_device_allocator(), handle.get_stream(), SP * N); + raft::mr::device::buffer predicates_v(handle.get_device_allocator(), + handle.get_stream(), SP * N); raft::mr::device::buffer addresses_v( handle.get_device_allocator(), handle.get_stream(), SP * N); @@ -275,108 +242,87 @@ inline vertex_t zeroCoverIteration(raft::handle_t const& handle, blocks_per_grid, threads_per_block, total_blocks, N, SP); // construct predicate matrix for edges. - kernel_rowPredicateConstructionCSR<<>>( - predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP, N); + predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP, + N); CHECK_CUDA(handle.get_stream()); M = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); - thrust::exclusive_scan( - thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin()); + thrust::exclusive_scan(thrust::device, addresses_v.begin(), + addresses_v.end(), addresses_v.begin()); if (M > 0) { csr_neighbors_v.resize(M); - kernel_rowScatterCSR<<>>( - predicates_v.data(), - addresses_v.data(), - csr_neighbors_v.data(), - csr_ptrs_v.data(), - M, - SP, - N); + kernel_rowScatterCSR<<>>( + predicates_v.data(), addresses_v.data(), csr_neighbors_v.data(), + csr_ptrs_v.data(), M, SP, N); CHECK_CUDA(handle.get_stream()); } } if (M > 0) { - coverZeroAndExpand(handle, - d_costs_dev, - csr_neighbors_v.data(), - csr_ptrs_v.data(), - d_vertices_dev, - d_row_data_dev, - d_col_data_dev, - d_flag, - SP, - N, - epsilon); + coverZeroAndExpand(handle, d_costs_dev, csr_neighbors_v.data(), + csr_ptrs_v.data(), d_vertices_dev, d_row_data_dev, + d_col_data_dev, d_flag, SP, N, epsilon); } return M; } -// Function for executing recursive zero cover. Returns the next step (Step 4 or Step 5) depending -// on the presence of uncovered zeros. +// Function for executing recursive zero cover. Returns the next step (Step 4 or Step 5) depending on the presence of uncovered zeros. template -inline void executeZeroCover(raft::handle_t const& handle, - weight_t const* d_costs_dev, - Vertices& d_vertices_dev, - VertexData& d_row_data_dev, - VertexData& d_col_data_dev, - bool* d_flag, - int SP, - vertex_t N, - weight_t epsilon) -{ +inline void executeZeroCover(raft::handle_t const &handle, + weight_t const *d_costs_dev, + Vertices &d_vertices_dev, + VertexData &d_row_data_dev, + VertexData &d_col_data_dev, bool *d_flag, + int SP, vertex_t N, weight_t epsilon) { vertex_t M = 1; while (M > 0) { - M = zeroCoverIteration( - handle, d_costs_dev, d_vertices_dev, d_row_data_dev, d_col_data_dev, d_flag, SP, N, epsilon); + M = zeroCoverIteration(handle, d_costs_dev, d_vertices_dev, d_row_data_dev, + d_col_data_dev, d_flag, SP, N, epsilon); } } // Function for executing reverse pass of the maximum matching. template -inline void reversePass(raft::handle_t const& handle, - VertexData& d_row_data_dev, - VertexData& d_col_data_dev, - int SP, - int N) -{ +inline void reversePass(raft::handle_t const &handle, + VertexData &d_row_data_dev, + VertexData &d_col_data_dev, int SP, int N) { int total_blocks = 0; dim3 blocks_per_grid; dim3 threads_per_block; std::size_t size = SP * N; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, size); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, + total_blocks, size); - raft::mr::device::buffer predicates_v( - handle.get_device_allocator(), handle.get_stream(), size); - raft::mr::device::buffer addresses_v( - handle.get_device_allocator(), handle.get_stream(), size); + raft::mr::device::buffer predicates_v(handle.get_device_allocator(), + handle.get_stream(), size); + raft::mr::device::buffer addresses_v(handle.get_device_allocator(), + handle.get_stream(), size); thrust::fill_n(thrust::device, predicates_v.data(), size, false); thrust::fill_n(thrust::device, addresses_v.data(), size, vertex_t{0}); // compact the reverse pass row vertices. - kernel_augmentPredicateConstruction<<>>( predicates_v.data(), addresses_v.data(), d_col_data_dev.is_visited, size); CHECK_CUDA(handle.get_stream()); // calculate total number of vertices. - std::size_t csr_size = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); + std::size_t csr_size = + thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); // exclusive scan for calculating the scatter addresses. - thrust::exclusive_scan( - thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin()); + thrust::exclusive_scan(thrust::device, addresses_v.begin(), addresses_v.end(), + addresses_v.begin()); if (csr_size > 0) { int total_blocks_1 = 0; @@ -388,12 +334,14 @@ inline void reversePass(raft::handle_t const& handle, raft::mr::device::buffer elements_v( handle.get_device_allocator(), handle.get_stream(), csr_size); - kernel_augmentScatter<<>>( + kernel_augmentScatter<<>>( elements_v.data(), predicates_v.data(), addresses_v.data(), size); CHECK_CUDA(handle.get_stream()); - kernel_reverseTraversal<<>>( + kernel_reverseTraversal<<>>( elements_v.data(), d_row_data_dev, d_col_data_dev, csr_size); CHECK_CUDA(handle.get_stream()); } @@ -401,30 +349,27 @@ inline void reversePass(raft::handle_t const& handle, // Function for executing augmentation pass of the maximum matching. template -inline void augmentationPass(raft::handle_t const& handle, - Vertices& d_vertices_dev, - VertexData& d_row_data_dev, - VertexData& d_col_data_dev, - int SP, - int N) -{ +inline void augmentationPass(raft::handle_t const &handle, + Vertices &d_vertices_dev, + VertexData &d_row_data_dev, + VertexData &d_col_data_dev, int SP, + int N) { int total_blocks = 0; dim3 blocks_per_grid; dim3 threads_per_block; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP * N); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, + total_blocks, SP * N); - raft::mr::device::buffer predicates_v( - handle.get_device_allocator(), handle.get_stream(), SP * N); - raft::mr::device::buffer addresses_v( - handle.get_device_allocator(), handle.get_stream(), SP * N); + raft::mr::device::buffer predicates_v(handle.get_device_allocator(), + handle.get_stream(), SP * N); + raft::mr::device::buffer addresses_v(handle.get_device_allocator(), + handle.get_stream(), SP * N); thrust::fill_n(thrust::device, predicates_v.data(), SP * N, false); thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0}); // compact the reverse pass row vertices. - kernel_augmentPredicateConstruction<<>>( predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP * N); @@ -435,8 +380,8 @@ inline void augmentationPass(raft::handle_t const& handle, vertex_t row_ids_csr_size = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); // exclusive scan for calculating the scatter addresses. - thrust::exclusive_scan( - thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin()); + thrust::exclusive_scan(thrust::device, addresses_v.begin(), addresses_v.end(), + addresses_v.begin()); if (row_ids_csr_size > 0) { int total_blocks_1 = 0; @@ -448,18 +393,17 @@ inline void augmentationPass(raft::handle_t const& handle, raft::mr::device::buffer elements_v( handle.get_device_allocator(), handle.get_stream(), row_ids_csr_size); - kernel_augmentScatter<<>>( - elements_v.data(), predicates_v.data(), addresses_v.data(), vertex_t{SP * N}); + kernel_augmentScatter<<>>( + elements_v.data(), predicates_v.data(), addresses_v.data(), + vertex_t{SP * N}); CHECK_CUDA(handle.get_stream()); - kernel_augmentation<<>>( - d_vertices_dev.row_assignments, - d_vertices_dev.col_assignments, - elements_v.data(), - d_row_data_dev, - d_col_data_dev, - vertex_t{N}, + kernel_augmentation<<>>( + d_vertices_dev.row_assignments, d_vertices_dev.col_assignments, + elements_v.data(), d_row_data_dev, d_col_data_dev, vertex_t{N}, row_ids_csr_size); CHECK_CUDA(handle.get_stream()); @@ -467,46 +411,35 @@ inline void augmentationPass(raft::handle_t const& handle, } template -inline void dualUpdate(raft::handle_t const& handle, - Vertices& d_vertices_dev, - VertexData& d_row_data_dev, - VertexData& d_col_data_dev, - int SP, - vertex_t N, - weight_t epsilon) -{ +inline void dualUpdate(raft::handle_t const &handle, + Vertices &d_vertices_dev, + VertexData &d_row_data_dev, + VertexData &d_col_data_dev, int SP, vertex_t N, + weight_t epsilon) { dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks; - raft::mr::device::buffer sp_min_v( - handle.get_device_allocator(), handle.get_stream(), 1); + raft::mr::device::buffer sp_min_v(handle.get_device_allocator(), + handle.get_stream(), 1); - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); - kernel_dualUpdate_1<<>>( - sp_min_v.data(), - d_vertices_dev.col_slacks, - d_vertices_dev.col_covers, - SP, - N, - std::numeric_limits::max()); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, + total_blocks, SP); + kernel_dualUpdate_1<<>>( + sp_min_v.data(), d_vertices_dev.col_slacks, d_vertices_dev.col_covers, SP, + N, std::numeric_limits::max()); CHECK_CUDA(handle.get_stream()); raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_dualUpdate_2<<>>( - sp_min_v.data(), - d_vertices_dev.row_duals, - d_vertices_dev.col_duals, - d_vertices_dev.col_slacks, - d_vertices_dev.row_covers, - d_vertices_dev.col_covers, - d_row_data_dev.is_visited, - d_col_data_dev.parents, - SP, - N, - std::numeric_limits::max(), + kernel_dualUpdate_2<<>>( + sp_min_v.data(), d_vertices_dev.row_duals, d_vertices_dev.col_duals, + d_vertices_dev.col_slacks, d_vertices_dev.row_covers, + d_vertices_dev.col_covers, d_row_data_dev.is_visited, + d_col_data_dev.parents, SP, N, std::numeric_limits::max(), epsilon); CHECK_CUDA(handle.get_stream()); @@ -514,19 +447,18 @@ inline void dualUpdate(raft::handle_t const& handle, // Function for calculating optimal objective function value using dual variables. template -inline void calcObjValDual(raft::handle_t const& handle, - weight_t* d_obj_val, - Vertices& d_vertices_dev, - int SP, - int N) -{ +inline void calcObjValDual(raft::handle_t const &handle, weight_t *d_obj_val, + Vertices &d_vertices_dev, int SP, + int N) { dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, + total_blocks, SP); - kernel_calcObjValDual<<>>( + kernel_calcObjValDual<<>>( d_obj_val, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N); CHECK_CUDA(handle.get_stream()); @@ -534,21 +466,20 @@ inline void calcObjValDual(raft::handle_t const& handle, // Function for calculating optimal objective function value using dual variables. template -inline void calcObjValPrimal(raft::handle_t const& handle, - weight_t* d_obj_val, - weight_t const* d_costs, - vertex_t const* d_row_assignments, - int SP, - vertex_t N) -{ +inline void calcObjValPrimal(raft::handle_t const &handle, weight_t *d_obj_val, + weight_t const *d_costs, + vertex_t const *d_row_assignments, int SP, + vertex_t N) { dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, + total_blocks, SP); - kernel_calcObjValPrimal<<>>( - d_obj_val, d_costs, d_row_assignments, SP, N); + kernel_calcObjValPrimal<<>>(d_obj_val, d_costs, + d_row_assignments, SP, N); CHECK_CUDA(handle.get_stream()); } diff --git a/cpp/include/raft/lap/lap_kernels.cuh b/cpp/include/raft/lap/lap_kernels.cuh index 45ad23afd1..8c9012ed72 100644 --- a/cpp/include/raft/lap/lap_kernels.cuh +++ b/cpp/include/raft/lap/lap_kernels.cuh @@ -48,57 +48,42 @@ const int AUGMENT{4}; const int MODIFIED{5}; template -bool __device__ near_zero(weight_t w, weight_t epsilon) -{ +bool __device__ near_zero(weight_t w, weight_t epsilon) { return ((w > -epsilon) && (w < epsilon)); } template <> -bool __device__ near_zero(int32_t w, int32_t epsilon) -{ +bool __device__ near_zero(int32_t w, int32_t epsilon) { return (w == 0); } template <> -bool __device__ near_zero(int64_t w, int64_t epsilon) -{ +bool __device__ near_zero(int64_t w, int64_t epsilon) { return (w == 0); } -// Device function for traversing the neighbors from start pointer to end pointer and updating the -// covers. The function sets d_next to 4 if there are uncovered zeros, indicating the requirement of -// Step 4 execution. +// Device function for traversing the neighbors from start pointer to end pointer and updating the covers. +// The function sets d_next to 4 if there are uncovered zeros, indicating the requirement of Step 4 execution. template -__device__ void cover_and_expand_row(weight_t const* d_elements, - weight_t const* d_row_duals, - weight_t const* d_col_duals, - weight_t* d_col_slacks, - int* d_row_covers, - int* d_col_covers, - vertex_t const* d_col_assignments, - bool* d_flag, - vertex_t* d_row_parents, - vertex_t* d_col_parents, - int* d_row_visited, - int* d_col_visited, - vertex_t rowid, - int spid, - int colid, - vertex_t N, - weight_t epsilon) -{ +__device__ void cover_and_expand_row( + weight_t const *d_elements, weight_t const *d_row_duals, + weight_t const *d_col_duals, weight_t *d_col_slacks, int *d_row_covers, + int *d_col_covers, vertex_t const *d_col_assignments, bool *d_flag, + vertex_t *d_row_parents, vertex_t *d_col_parents, int *d_row_visited, + int *d_col_visited, vertex_t rowid, int spid, int colid, vertex_t N, + weight_t epsilon) { int ROWID = spid * N + rowid; int COLID = spid * N + colid; - weight_t slack = - d_elements[spid * N * N + rowid * N + colid] - d_row_duals[ROWID] - d_col_duals[COLID]; + weight_t slack = d_elements[spid * N * N + rowid * N + colid] - + d_row_duals[ROWID] - d_col_duals[COLID]; int nxt_rowid = d_col_assignments[COLID]; int NXT_ROWID = spid * N + nxt_rowid; if (rowid != nxt_rowid && d_col_covers[COLID] == 0) { if (slack < d_col_slacks[COLID]) { - d_col_slacks[COLID] = slack; + d_col_slacks[COLID] = slack; d_col_parents[COLID] = ROWID; } @@ -107,12 +92,13 @@ __device__ void cover_and_expand_row(weight_t const* d_elements, d_row_parents[NXT_ROWID] = COLID; // update parent info d_row_covers[NXT_ROWID] = 0; - d_col_covers[COLID] = 1; + d_col_covers[COLID] = 1; - if (d_row_visited[NXT_ROWID] != VISITED) d_row_visited[NXT_ROWID] = ACTIVE; + if (d_row_visited[NXT_ROWID] != VISITED) + d_row_visited[NXT_ROWID] = ACTIVE; } else { d_col_visited[COLID] = REVERSE; - *d_flag = true; + *d_flag = true; } } } @@ -121,34 +107,28 @@ __device__ void cover_and_expand_row(weight_t const* d_elements, // Device function for traversing an alternating path from unassigned row to unassigned column. template -__device__ void __reverse_traversal(int* d_row_visited, - vertex_t* d_row_children, - vertex_t* d_col_children, - vertex_t const* d_row_parents, - vertex_t const* d_col_parents, - int cur_colid) -{ +__device__ void __reverse_traversal( + int *d_row_visited, vertex_t *d_row_children, vertex_t *d_col_children, + vertex_t const *d_row_parents, vertex_t const *d_col_parents, int cur_colid) { int cur_rowid = -1; while (cur_colid != -1) { d_col_children[cur_colid] = cur_rowid; - cur_rowid = d_col_parents[cur_colid]; + cur_rowid = d_col_parents[cur_colid]; d_row_children[cur_rowid] = cur_colid; - cur_colid = d_row_parents[cur_rowid]; + cur_colid = d_row_parents[cur_rowid]; } d_row_visited[cur_rowid] = AUGMENT; } // Device function for augmenting the alternating path from unassigned column to unassigned row. template -__device__ void __augment(vertex_t* d_row_assignments, - vertex_t* d_col_assignments, - vertex_t const* d_row_children, - vertex_t const* d_col_children, - vertex_t cur_rowid, - vertex_t N) -{ +__device__ void __augment(vertex_t *d_row_assignments, + vertex_t *d_col_assignments, + vertex_t const *d_row_children, + vertex_t const *d_col_children, vertex_t cur_rowid, + vertex_t N) { int cur_colid = -1; while (cur_rowid != -1) { @@ -165,18 +145,20 @@ __device__ void __augment(vertex_t* d_row_assignments, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_rowReduction( - weight_t const* d_costs, weight_t* d_row_duals, int SP, vertex_t N, weight_t infinity) -{ - int spid = blockIdx.y * blockDim.y + threadIdx.y; - int rowid = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void kernel_rowReduction(weight_t const *d_costs, + weight_t *d_row_duals, int SP, vertex_t N, + weight_t infinity) { + int spid = blockIdx.y * blockDim.y + threadIdx.y; + int rowid = blockIdx.x * blockDim.x + threadIdx.x; weight_t min = infinity; if (spid < SP && rowid < N) { for (int colid = 0; colid < N; colid++) { weight_t slack = d_costs[spid * N * N + rowid * N + colid]; - if (slack < min) { min = slack; } + if (slack < min) { + min = slack; + } } d_row_duals[spid * N + rowid] = min; @@ -187,26 +169,25 @@ __global__ void kernel_rowReduction( // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_columnReduction(weight_t const* d_costs, - weight_t const* d_row_duals, - weight_t* d_col_duals, - int SP, - vertex_t N, - weight_t infinity) -{ - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_columnReduction(weight_t const *d_costs, + weight_t const *d_row_duals, + weight_t *d_col_duals, int SP, + vertex_t N, weight_t infinity) { + int spid = blockIdx.y * blockDim.y + threadIdx.y; int colid = blockIdx.x * blockDim.x + threadIdx.x; weight_t min = infinity; if (spid < SP && colid < N) { for (int rowid = 0; rowid < N; rowid++) { - weight_t cost = d_costs[spid * N * N + rowid * N + colid]; + weight_t cost = d_costs[spid * N * N + rowid * N + colid]; weight_t row_dual = d_row_duals[spid * N + rowid]; weight_t slack = cost - row_dual; - if (slack < min) { min = slack; } + if (slack < min) { + min = slack; + } } d_col_duals[spid * N + colid] = min; @@ -215,18 +196,12 @@ __global__ void kernel_columnReduction(weight_t const* d_costs, // Kernel for calculating initial assignments. template -__global__ void kernel_computeInitialAssignments(weight_t const* d_costs, - weight_t const* d_row_duals, - weight_t const* d_col_duals, - vertex_t* d_row_assignments, - vertex_t* d_col_assignments, - int* d_row_lock, - int* d_col_lock, - int SP, - vertex_t N, - weight_t epsilon) -{ - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_computeInitialAssignments( + weight_t const *d_costs, weight_t const *d_row_duals, + weight_t const *d_col_duals, vertex_t *d_row_assignments, + vertex_t *d_col_assignments, int *d_row_lock, int *d_col_lock, int SP, + vertex_t N, weight_t epsilon) { + int spid = blockIdx.y * blockDim.y + threadIdx.y; int colid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && colid < N) { @@ -238,15 +213,15 @@ __global__ void kernel_computeInitialAssignments(weight_t const* d_costs, if (d_col_lock[overall_colid] == 1) break; - weight_t cost = d_costs[spid * N * N + rowid * N + colid]; + weight_t cost = d_costs[spid * N * N + rowid * N + colid]; weight_t row_dual = d_row_duals[overall_rowid]; - weight_t slack = cost - row_dual - col_dual; + weight_t slack = cost - row_dual - col_dual; if (near_zero(slack, epsilon)) { if (atomicCAS(&d_row_lock[overall_rowid], 0, 1) == 0) { d_row_assignments[overall_rowid] = colid; d_col_assignments[overall_colid] = rowid; - d_col_lock[overall_colid] = 1; + d_col_lock[overall_colid] = 1; } } } @@ -255,10 +230,10 @@ __global__ void kernel_computeInitialAssignments(weight_t const* d_costs, // Kernel for populating the cover arrays and initializing alternating tree. template -__global__ void kernel_computeRowCovers( - vertex_t* d_row_assignments, int* d_row_covers, int* d_row_visited, int SP, vertex_t N) -{ - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_computeRowCovers(vertex_t *d_row_assignments, + int *d_row_covers, int *d_row_visited, + int SP, vertex_t N) { + int spid = blockIdx.y * blockDim.y + threadIdx.y; int rowid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && rowid < N) { @@ -274,10 +249,11 @@ __global__ void kernel_computeRowCovers( // Kernel for populating the predicate matrix for edges in row major format. template -__global__ void kernel_rowPredicateConstructionCSR( - bool* d_predicates, vertex_t* d_addresses, int* d_row_visited, int SP, vertex_t N) -{ - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_rowPredicateConstructionCSR(bool *d_predicates, + vertex_t *d_addresses, + int *d_row_visited, int SP, + vertex_t N) { + int spid = blockIdx.y * blockDim.y + threadIdx.y; int rowid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && rowid < N) { @@ -285,160 +261,130 @@ __global__ void kernel_rowPredicateConstructionCSR( if (d_row_visited[index] == ACTIVE) { d_predicates[index] = true; - d_addresses[index] = 1; + d_addresses[index] = 1; } else { d_predicates[index] = false; - d_addresses[index] = 0; + d_addresses[index] = 0; } } } // Kernel for scattering the edges based on the scatter addresses. template -__global__ void kernel_rowScatterCSR(bool const* d_predicates, - vertex_t const* d_addresses, - vertex_t* d_neighbors, - vertex_t* d_ptrs, - vertex_t M, - int SP, - vertex_t N) -{ - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_rowScatterCSR(bool const *d_predicates, + vertex_t const *d_addresses, + vertex_t *d_neighbors, vertex_t *d_ptrs, + vertex_t M, int SP, vertex_t N) { + int spid = blockIdx.y * blockDim.y + threadIdx.y; int rowid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && rowid < N) { int index = spid * N + rowid; - bool predicate = d_predicates[index]; + bool predicate = d_predicates[index]; vertex_t compid = d_addresses[index]; - if (predicate) { d_neighbors[compid] = rowid; } + if (predicate) { + d_neighbors[compid] = rowid; + } if (rowid == 0) { d_ptrs[spid] = compid; - d_ptrs[SP] = M; + d_ptrs[SP] = M; } } } // Kernel for finding the minimum zero cover. template -__global__ void kernel_coverAndExpand(bool* d_flag, - vertex_t const* d_ptrs, - vertex_t const* d_neighbors, - weight_t const* d_elements, +__global__ void kernel_coverAndExpand(bool *d_flag, vertex_t const *d_ptrs, + vertex_t const *d_neighbors, + weight_t const *d_elements, Vertices d_vertices, VertexData d_row_data, - VertexData d_col_data, - int SP, - vertex_t N, - weight_t epsilon) -{ - int spid = blockIdx.y * blockDim.y + threadIdx.y; + VertexData d_col_data, int SP, + vertex_t N, weight_t epsilon) { + int spid = blockIdx.y * blockDim.y + threadIdx.y; int colid = blockIdx.x * blockDim.x + threadIdx.x; // Load values into local memory if (spid < SP && colid < N) { thrust::for_each( - thrust::seq, - d_neighbors + d_ptrs[spid], - d_neighbors + d_ptrs[spid + 1], - [d_elements, d_vertices, d_flag, d_row_data, d_col_data, spid, colid, N, epsilon] __device__( - vertex_t rowid) { - cover_and_expand_row(d_elements, - d_vertices.row_duals, - d_vertices.col_duals, - d_vertices.col_slacks, - d_vertices.row_covers, - d_vertices.col_covers, - d_vertices.col_assignments, - d_flag, - d_row_data.parents, - d_col_data.parents, - d_row_data.is_visited, - d_col_data.is_visited, - rowid, - spid, - colid, - N, - epsilon); + thrust::seq, d_neighbors + d_ptrs[spid], d_neighbors + d_ptrs[spid + 1], + [d_elements, d_vertices, d_flag, d_row_data, d_col_data, spid, colid, N, + epsilon] __device__(vertex_t rowid) { + cover_and_expand_row( + d_elements, d_vertices.row_duals, d_vertices.col_duals, + d_vertices.col_slacks, d_vertices.row_covers, d_vertices.col_covers, + d_vertices.col_assignments, d_flag, d_row_data.parents, + d_col_data.parents, d_row_data.is_visited, d_col_data.is_visited, + rowid, spid, colid, N, epsilon); }); } } // Kernel for constructing the predicates for reverse pass or augmentation candidates. template -__global__ void kernel_augmentPredicateConstruction(bool* d_predicates, - vertex_t* d_addresses, - int* d_visited, - int size) -{ +__global__ void kernel_augmentPredicateConstruction(bool *d_predicates, + vertex_t *d_addresses, + int *d_visited, int size) { int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { int visited = d_visited[id]; if ((visited == REVERSE) || (visited == AUGMENT)) { d_predicates[id] = true; - d_addresses[id] = 1; + d_addresses[id] = 1; } else { d_predicates[id] = false; - d_addresses[id] = 0; + d_addresses[id] = 0; } } } // Kernel for scattering the vertices based on the scatter addresses. template -__global__ void kernel_augmentScatter(vertex_t* d_elements, - bool const* d_predicates, - vertex_t const* d_addresses, - std::size_t size) -{ +__global__ void kernel_augmentScatter(vertex_t *d_elements, + bool const *d_predicates, + vertex_t const *d_addresses, + std::size_t size) { int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { - if (d_predicates[id]) { d_elements[d_addresses[id]] = id; } + if (d_predicates[id]) { + d_elements[d_addresses[id]] = id; + } } } // Kernel for executing the reverse pass of the maximum matching algorithm. template -__global__ void kernel_reverseTraversal(vertex_t* d_elements, +__global__ void kernel_reverseTraversal(vertex_t *d_elements, VertexData d_row_data, VertexData d_col_data, - int size) -{ + int size) { int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { - __reverse_traversal(d_row_data.is_visited, - d_row_data.children, - d_col_data.children, - d_row_data.parents, - d_col_data.parents, - d_elements[id]); + __reverse_traversal(d_row_data.is_visited, d_row_data.children, + d_col_data.children, d_row_data.parents, + d_col_data.parents, d_elements[id]); } } // Kernel for executing the augmentation pass of the maximum matching algorithm. template -__global__ void kernel_augmentation(vertex_t* d_row_assignments, - vertex_t* d_col_assignments, - vertex_t const* d_row_elements, +__global__ void kernel_augmentation(vertex_t *d_row_assignments, + vertex_t *d_col_assignments, + vertex_t const *d_row_elements, VertexData d_row_data, - VertexData d_col_data, - vertex_t N, - vertex_t size) -{ + VertexData d_col_data, vertex_t N, + vertex_t size) { int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { - __augment(d_row_assignments, - d_col_assignments, - d_row_data.children, - d_col_data.children, - d_row_elements[id], - N); + __augment(d_row_assignments, d_col_assignments, d_row_data.children, + d_col_data.children, d_row_elements[id], N); } } @@ -446,21 +392,18 @@ __global__ void kernel_augmentation(vertex_t* d_row_assignments, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_dualUpdate_1(weight_t* d_sp_min, - weight_t const* d_col_slacks, - int const* d_col_covers, - int SP, - vertex_t N, - weight_t infinity) -{ +__global__ void kernel_dualUpdate_1(weight_t *d_sp_min, + weight_t const *d_col_slacks, + int const *d_col_covers, int SP, vertex_t N, + weight_t infinity) { int spid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP) { weight_t min = infinity; for (int colid = 0; colid < N; colid++) { - int index = spid * N + colid; + int index = spid * N + colid; weight_t slack = d_col_slacks[index]; - int col_cover = d_col_covers[index]; + int col_cover = d_col_covers[index]; if (col_cover == 0) if (slack < min) min = slack; @@ -474,29 +417,21 @@ __global__ void kernel_dualUpdate_1(weight_t* d_sp_min, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_dualUpdate_2(weight_t const* d_sp_min, - weight_t* d_row_duals, - weight_t* d_col_duals, - weight_t* d_col_slacks, - int const* d_row_covers, - int const* d_col_covers, - int* d_row_visited, - vertex_t* d_col_parents, - int SP, - vertex_t N, - weight_t infinity, - weight_t epsilon) -{ +__global__ void kernel_dualUpdate_2( + weight_t const *d_sp_min, weight_t *d_row_duals, weight_t *d_col_duals, + weight_t *d_col_slacks, int const *d_row_covers, int const *d_col_covers, + int *d_row_visited, vertex_t *d_col_parents, int SP, vertex_t N, + weight_t infinity, weight_t epsilon) { int spid = blockIdx.y * blockDim.y + threadIdx.y; - int id = blockIdx.x * blockDim.x + threadIdx.x; + int id = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && id < N) { int index = spid * N + id; if (d_sp_min[spid] < infinity) { weight_t theta = d_sp_min[spid]; - int row_cover = d_row_covers[index]; - int col_cover = d_col_covers[index]; + int row_cover = d_row_covers[index]; + int col_cover = d_col_covers[index]; if (row_cover == 0) // Row vertex is reachable from source. d_row_duals[index] += theta; @@ -518,12 +453,10 @@ __global__ void kernel_dualUpdate_2(weight_t const* d_sp_min, // Kernel for calculating optimal objective function value using dual variables. template -__global__ void kernel_calcObjValDual(weight_t* d_obj_val_dual, - weight_t const* d_row_duals, - weight_t const* d_col_duals, - int SP, - vertex_t N) -{ +__global__ void kernel_calcObjValDual(weight_t *d_obj_val_dual, + weight_t const *d_row_duals, + weight_t const *d_col_duals, int SP, + vertex_t N) { int spid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP) { @@ -538,12 +471,10 @@ __global__ void kernel_calcObjValDual(weight_t* d_obj_val_dual, // Kernel for calculating optimal objective function value using dual variables. template -__global__ void kernel_calcObjValPrimal(weight_t* d_obj_val_primal, - weight_t const* d_costs, - vertex_t const* d_row_assignments, - int SP, - vertex_t N) -{ +__global__ void kernel_calcObjValPrimal(weight_t *d_obj_val_primal, + weight_t const *d_costs, + vertex_t const *d_row_assignments, + int SP, vertex_t N) { int spid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP) { diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh index 11d3174951..7a454f64e2 100644 --- a/cpp/include/raft/linalg/add.cuh +++ b/cpp/include/raft/linalg/add.cuh @@ -37,8 +37,8 @@ namespace linalg { * @param stream cuda stream where to launch work */ template -void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) -{ +void addScalar(OutT *out, const InT *in, InT scalar, IdxType len, + cudaStream_t stream) { auto op = [scalar] __device__(InT in) { return OutT(in + scalar); }; unaryOp(out, in, len, op, stream); } @@ -57,24 +57,23 @@ void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t s * @param stream cuda stream where to launch work */ template -void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) -{ +void add(OutT *out, const InT *in1, const InT *in2, IdxType len, + cudaStream_t stream) { auto op = [] __device__(InT a, InT b) { return OutT(a + b); }; binaryOp(out, in1, in2, len, op, stream); } template -__global__ void add_dev_scalar_kernel(math_t* outDev, - const math_t* inDev, - const math_t* singleScalarDev, - IdxType len) -{ +__global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev, + const math_t *singleScalarDev, + IdxType len) { IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; } + if (i < len) { + outDev[i] = inDev[i] + *singleScalarDev; + } } -/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and - * write result to outDev[i] +/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i] * @tparam math_t data-type upon which the math operation will be performed * @tparam IdxType Integer type used to for addressing * @param outDev the output buffer @@ -84,16 +83,14 @@ __global__ void add_dev_scalar_kernel(math_t* outDev, * @param stream cuda stream */ template -void addDevScalar(math_t* outDev, - const math_t* inDev, - const math_t* singleScalarDev, - IdxType len, - cudaStream_t stream) -{ +void addDevScalar(math_t *outDev, const math_t *inDev, + const math_t *singleScalarDev, IdxType len, + cudaStream_t stream) { // TODO: block dimension has not been tuned dim3 block(256); dim3 grid(raft::ceildiv(len, (IdxType)block.x)); - add_dev_scalar_kernel<<>>(outDev, inDev, singleScalarDev, len); + add_dev_scalar_kernel + <<>>(outDev, inDev, singleScalarDev, len); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh index a49a433941..940d786e87 100644 --- a/cpp/include/raft/linalg/binary_op.cuh +++ b/cpp/include/raft/linalg/binary_op.cuh @@ -22,10 +22,10 @@ namespace raft { namespace linalg { -template -__global__ void binaryOpKernel( - OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op) -{ +template +__global__ void binaryOpKernel(OutType *out, const InType *in1, + const InType *in2, IdxType len, Lambda op) { typedef TxN_t InVecType; typedef TxN_t OutVecType; InVecType a, b; @@ -42,11 +42,12 @@ __global__ void binaryOpKernel( c.store(out, idx); } -template -void binaryOpImpl( - OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) -{ - const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); +template +void binaryOpImpl(OutType *out, const InType *in1, const InType *in2, + IdxType len, Lambda op, cudaStream_t stream) { + const IdxType nblks = + raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); binaryOpKernel <<>>(out, in1, in2, len, op); CUDA_CHECK(cudaPeekAtLastError()); @@ -55,8 +56,8 @@ void binaryOpImpl( /** * @brief Checks if addresses are aligned on N bytes */ -inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint64_t N) -{ +inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, + uint64_t N) { return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0; } @@ -76,36 +77,38 @@ inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint6 * @note Lambda must be a functor with the following signature: * `OutType func(const InType& val1, const InType& val2);` */ -template -void binaryOp( - OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) -{ - constexpr auto maxSize = sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType); - size_t bytes = len * maxSize; - uint64_t in1Addr = uint64_t(in1); - uint64_t in2Addr = uint64_t(in2); - uint64_t outAddr = uint64_t(out); - if (16 / maxSize && bytes % 16 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 16)) { +template +void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len, + Lambda op, cudaStream_t stream) { + constexpr auto maxSize = + sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t in1Addr = uint64_t(in1); + uint64_t in2Addr = uint64_t(in2); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && + addressAligned(in1Addr, in2Addr, outAddr, 16)) { binaryOpImpl( out, in1, in2, len, op, stream); - } else if (8 / maxSize && bytes % 8 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 8)) { + } else if (8 / maxSize && bytes % 8 == 0 && + addressAligned(in1Addr, in2Addr, outAddr, 8)) { binaryOpImpl( out, in1, in2, len, op, stream); - } else if (4 / maxSize && bytes % 4 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 4)) { + } else if (4 / maxSize && bytes % 4 == 0 && + addressAligned(in1Addr, in2Addr, outAddr, 4)) { binaryOpImpl( out, in1, in2, len, op, stream); - } else if (2 / maxSize && bytes % 2 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 2)) { + } else if (2 / maxSize && bytes % 2 == 0 && + addressAligned(in1Addr, in2Addr, outAddr, 2)) { binaryOpImpl( out, in1, in2, len, op, stream); } else if (1 / maxSize) { binaryOpImpl( out, in1, in2, len, op, stream); } else { - binaryOpImpl(out, in1, in2, len, op, stream); + binaryOpImpl(out, in1, in2, len, + op, stream); } } diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh index b129fe4758..b5a93c4953 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh @@ -122,16 +122,9 @@ namespace linalg { * conditioned systems. Negative values mean no regularizaton. */ template -void choleskyRank1Update(const raft::handle_t& handle, - math_t* L, - int n, - int ld, - void* workspace, - int* n_bytes, - cublasFillMode_t uplo, - cudaStream_t stream, - math_t eps = -1) -{ +void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, + void *workspace, int *n_bytes, cublasFillMode_t uplo, + cudaStream_t stream, math_t eps = -1) { // The matrix A' is defined as: // A' = [[A_11, A_12] // [A_21, A_22]] @@ -151,17 +144,18 @@ void choleskyRank1Update(const raft::handle_t& handle, // We need a workspace in device memory to store a scalar. Additionally, in // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats. const int align = 256; - int offset = - (uplo == CUBLAS_FILL_MODE_LOWER) ? raft::alignTo(sizeof(math_t) * (n - 1), align) : 0; + int offset = (uplo == CUBLAS_FILL_MODE_LOWER) + ? raft::alignTo(sizeof(math_t) * (n - 1), align) + : 0; if (workspace == nullptr) { *n_bytes = offset + 1 * sizeof(math_t); return; } - math_t* s = reinterpret_cast(((char*)workspace) + offset); - math_t* L_22 = L + (n - 1) * ld + n - 1; + math_t *s = reinterpret_cast(((char *)workspace) + offset); + math_t *L_22 = L + (n - 1) * ld + n - 1; - math_t* A_new; - math_t* A_row; + math_t *A_new; + math_t *A_row; if (uplo == CUBLAS_FILL_MODE_UPPER) { // A_new is stored as the n-1 th column of L A_new = L + (n - 1) * ld; @@ -170,36 +164,27 @@ void choleskyRank1Update(const raft::handle_t& handle, // as the n-th row of L. Since the matrix is column major, this is non // contiguous. We copy elements from A_row to a contiguous workspace A_new. A_row = L + n - 1; - A_new = reinterpret_cast(workspace); - CUBLAS_CHECK( - raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream)); + A_new = reinterpret_cast(workspace); + CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, + A_row, ld, A_new, 1, stream)); } - cublasOperation_t op = (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t op = + (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; if (n > 1) { // Calculate L_12 = x by solving equation L_11 x = A_12 math_t alpha = 1; - CUBLAS_CHECK(raft::linalg::cublastrsm(handle.get_cublas_handle(), - CUBLAS_SIDE_LEFT, - uplo, - op, - CUBLAS_DIAG_NON_UNIT, - n - 1, - 1, - &alpha, - L, - ld, - A_new, - n - 1, - stream)); + CUBLAS_CHECK(raft::linalg::cublastrsm( + handle.get_cublas_handle(), CUBLAS_SIDE_LEFT, uplo, op, + CUBLAS_DIAG_NON_UNIT, n - 1, 1, &alpha, L, ld, A_new, n - 1, stream)); // A_new now stores L_12, we calculate s = L_12 * L_12 - CUBLAS_CHECK( - raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream)); + CUBLAS_CHECK(raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, + A_new, 1, A_new, 1, s, stream)); if (uplo == CUBLAS_FILL_MODE_LOWER) { // Copy back the L_12 elements as the n-th row of L - CUBLAS_CHECK( - raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream)); + CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, + A_new, 1, A_row, ld, stream)); } } else { // n == 1 case CUDA_CHECK(cudaMemsetAsync(s, 0, sizeof(math_t), stream)); @@ -217,7 +202,9 @@ void choleskyRank1Update(const raft::handle_t& handle, // the system is very ill conditioned then the A_22 - L_12 * L_12 can be // negative, which would result L_22 = NaN. A small positive eps parameter // can be used to prevent this. - if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { L_22_host = eps; } + if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { + L_22_host = eps; + } ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update"); raft::update_device(L_22, &L_22_host, 1, stream); } diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh index 7e0744f98a..ef983ff3d0 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/coalesced_reduction.cuh @@ -26,27 +26,18 @@ namespace linalg { // of the matrix, i.e. reduce along rows for row major or reduce along columns // for column major layout. Kernel does an inplace reduction adding to original // values of dots. -template -__global__ void coalescedReductionKernel(OutType* dots, - const InType* data, - int D, - int N, - OutType init, +template +__global__ void coalescedReductionKernel(OutType *dots, const InType *data, + int D, int N, OutType init, MainLambda main_op, ReduceLambda reduce_op, FinalLambda final_op, - bool inplace = false) -{ + bool inplace = false) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; OutType thread_data = init; - IdxType rowStart = blockIdx.x * D; + IdxType rowStart = blockIdx.x * D; for (IdxType i = threadIdx.x; i < D; i += TPB) { IdxType idx = rowStart + i; thread_data = reduce_op(thread_data, main_op(data[idx], i)); @@ -88,37 +79,33 @@ __global__ void coalescedReductionKernel(OutType* dots, * @param inplace reduction result added inplace or overwrites old values? * @param stream cuda stream where to launch work */ -template , +template , typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void coalescedReduction(OutType* dots, - const InType* data, - int D, - int N, - OutType init, - cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::Nop(), + typename FinalLambda = raft::Nop> +void coalescedReduction(OutType *dots, const InType *data, int D, int N, + OutType init, cudaStream_t stream, bool inplace = false, + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) -{ + FinalLambda final_op = raft::Nop()) { // One block per reduction // Efficient only for large leading dimensions if (D <= 32) { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, + final_op, inplace); } else if (D <= 64) { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, + final_op, inplace); } else if (D <= 128) { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, + final_op, inplace); } else { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, + final_op, inplace); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh index 35d9d96ea4..aa711a9140 100644 --- a/cpp/include/raft/linalg/contractions.cuh +++ b/cpp/include/raft/linalg/contractions.cuh @@ -55,7 +55,8 @@ namespace linalg { * thread block. This also determines the number of threads per * thread block */ -template +template struct KernelPolicy { enum { /** number of elements along K worked upon per main loop iteration */ @@ -100,7 +101,8 @@ struct KernelPolicy { }; // struct KernelPolicy -template +template struct ColKernelPolicy { enum { /** number of elements along K worked upon per main loop iteration */ @@ -149,8 +151,7 @@ struct ColKernelPolicy { * @{ */ template -struct Policy4x4 { -}; +struct Policy4x4 {}; template struct Policy4x4 { @@ -179,7 +180,8 @@ struct Policy4x4 { * @tparam Policy policy used to customize memory access behavior. * See documentation for `KernelPolicy` to know more. */ -template +template struct Contractions_NT { protected: typedef Policy P; @@ -245,7 +247,8 @@ struct Contractions_NT { * @param[in] _k number of cols of X and Y * @param[in] _smem shared memory region used during computations */ - DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, IdxT _k, char* _smem) + DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, + IdxT _k, char* _smem) : m(_m), n(_n), k(_k), @@ -262,9 +265,7 @@ struct Contractions_NT { sx((DataT*)_smem), sy(&(sx[P::SmemPageX])), pageWr(0), - pageRd(0) - { - } + pageRd(0) {} /** * @brief Ctor @@ -275,15 +276,8 @@ struct Contractions_NT { * @param[in] _k number of cols of X and Y * @param[in] _smem shared memory region used during computations */ - DI Contractions_NT(const DataT* _x, - const DataT* _y, - IdxT _m, - IdxT _n, - IdxT _k, - IdxT _lda, - IdxT _ldb, - IdxT _ldd, - char* _smem) + DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, + IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, char* _smem) : m(_m), n(_n), k(_k), @@ -297,18 +291,17 @@ struct Contractions_NT { sx((DataT*)_smem), sy(&(sx[P::SmemPageX])), pageWr(0), - pageRd(0) - { + pageRd(0) { if (isRowMajor) { xrowid = IdxT(blockIdx.y) * P::Mblk + srowid; yrowid = IdxT(blockIdx.x) * P::Nblk + srowid; - x = _x + xrowid * lda; - y = _y + yrowid * ldb; + x = _x + xrowid * lda; + y = _y + yrowid * ldb; } else { xrowid = IdxT(blockIdx.y) * P::Mblk; yrowid = IdxT(blockIdx.x) * P::Nblk; - x = _x + xrowid + srowid * lda; - y = _y + yrowid + srowid * ldb; + x = _x + xrowid + srowid * lda; + y = _y + yrowid + srowid * ldb; } } @@ -317,8 +310,7 @@ struct Contractions_NT { * @brief Load current block of X/Y from global memory to registers * @param[in] kidx current start index of k to be loaded */ - DI void ldgXY(IdxT kidx) - { + DI void ldgXY(IdxT kidx) { ldgX(kidx); ldgY(kidx); } @@ -327,8 +319,7 @@ struct Contractions_NT { * @brief Store current block of X/Y from registers to smem * @param[in] kidx current start index of k to be loaded */ - DI void stsXY() - { + DI void stsXY() { stsX(sx + pageWr * P::SmemPage); stsY(sy + pageWr * P::SmemPage); } @@ -337,15 +328,13 @@ struct Contractions_NT { * @brief Load X and Y block from shared memory to registers * @param[in] kidx k value from the current k-block to be loaded from smem */ - DI void ldsXY(int kidx) - { + DI void ldsXY(int kidx) { ldsX(kidx, sx + pageRd * P::SmemPage); ldsY(kidx, sy + pageRd * P::SmemPage); } private: - DI void ldgX(IdxT kidx) - { + DI void ldgX(IdxT kidx) { if (isRowMajor) { auto numRows = m; auto koffset = kidx + scolid; @@ -362,10 +351,11 @@ struct Contractions_NT { } } else { const auto numRows = k; - auto koffset = scolid; + auto koffset = scolid; #pragma unroll for (int i = 0; i < P::LdgPerThX; ++i) { - if ((koffset + xrowid) < lda && (srowid + kidx + i * P::LdgRowsX) < numRows) { + if ((koffset + xrowid) < lda && + (srowid + kidx + i * P::LdgRowsX) < numRows) { ldg(ldgDataX[i], x + (kidx + i * P::LdgRowsX) * lda + koffset); } else { #pragma unroll @@ -377,8 +367,7 @@ struct Contractions_NT { } } - DI void ldgY(IdxT kidx) - { + DI void ldgY(IdxT kidx) { if (isRowMajor) { auto numRows = n; auto koffset = kidx + scolid; @@ -398,7 +387,8 @@ struct Contractions_NT { auto koffset = scolid; #pragma unroll for (int i = 0; i < P::LdgPerThY; ++i) { - if ((koffset + yrowid) < ldb && (srowid + kidx + i * P::LdgRowsY) < numRows) { + if ((koffset + yrowid) < ldb && + (srowid + kidx + i * P::LdgRowsY) < numRows) { ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset); } else { #pragma unroll @@ -410,8 +400,7 @@ struct Contractions_NT { } } - DI void stsX(DataT* smem) - { + DI void stsX(DataT* smem) { auto* saddr = smem + srowid * P::SmemStride + scolid; #pragma unroll for (int i = 0; i < P::LdgPerThX; ++i) { @@ -419,8 +408,7 @@ struct Contractions_NT { } } - DI void stsY(DataT* smem) - { + DI void stsY(DataT* smem) { auto* saddr = smem + srowid * P::SmemStride + scolid; #pragma unroll for (int i = 0; i < P::LdgPerThY; ++i) { @@ -428,8 +416,7 @@ struct Contractions_NT { } } - DI void ldsX(int kidx, DataT* smem) - { + DI void ldsX(int kidx, DataT* smem) { if (isRowMajor) { auto* saddr = smem + accrowid * P::SmemStride + kidx; #pragma unroll @@ -448,8 +435,7 @@ struct Contractions_NT { } } - DI void ldsY(int kidx, DataT* smem) - { + DI void ldsY(int kidx, DataT* smem) { if (isRowMajor) { auto* saddr = smem + acccolid * P::SmemStride + kidx; #pragma unroll diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h index 2d18691410..7c79e6c91d 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.h +++ b/cpp/include/raft/linalg/cublas_wrappers.h @@ -25,7 +25,8 @@ #include #define _CUBLAS_ERR_TO_STR(err) \ - case err: return #err + case err: \ + return #err namespace raft { @@ -33,15 +34,15 @@ namespace raft { * @brief Exception thrown when a cuBLAS error is encountered. */ struct cublas_error : public raft::exception { - explicit cublas_error(char const* const message) : raft::exception(message) {} - explicit cublas_error(std::string const& message) : raft::exception(message) {} + explicit cublas_error(char const *const message) : raft::exception(message) {} + explicit cublas_error(std::string const &message) + : raft::exception(message) {} }; namespace linalg { namespace detail { -inline const char* cublas_error_to_string(cublasStatus_t err) -{ +inline const char *cublas_error_to_string(cublasStatus_t err) { switch (err) { _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); @@ -53,7 +54,8 @@ inline const char* cublas_error_to_string(cublasStatus_t err) _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); - default: return "CUBLAS_STATUS_UNKNOWN"; + default: + return "CUBLAS_STATUS_UNKNOWN"; }; } @@ -69,19 +71,16 @@ inline const char* cublas_error_to_string(cublasStatus_t err) * Invokes a cuBLAS runtime API function call, if the call does not return * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred */ -#define CUBLAS_TRY(call) \ - do { \ - cublasStatus_t const status = (call); \ - if (CUBLAS_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "cuBLAS error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ - raft::linalg::detail::cublas_error_to_string(status)); \ - throw raft::cublas_error(msg); \ - } \ +#define CUBLAS_TRY(call) \ + do { \ + cublasStatus_t const status = (call); \ + if (CUBLAS_STATUS_SUCCESS != status) { \ + std::string msg{}; \ + SET_ERROR_MSG( \ + msg, "cuBLAS error encountered at: ", "call='%s', Reason=%d:%s", \ + #call, status, raft::linalg::detail::cublas_error_to_string(status)); \ + throw raft::cublas_error(msg); \ + } \ } while (0) /** FIXME: temporary alias for cuML compatibility */ @@ -108,39 +107,22 @@ namespace linalg { * @{ */ template -cublasStatus_t cublasaxpy(cublasHandle_t handle, - int n, - const T* alpha, - const T* x, - int incx, - T* y, - int incy, +cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, const T *alpha, + const T *x, int incx, T *y, int incy, cudaStream_t stream); template <> -inline cublasStatus_t cublasaxpy(cublasHandle_t handle, - int n, - const float* alpha, - const float* x, - int incx, - float* y, - int incy, - cudaStream_t stream) -{ +inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, + const float *alpha, const float *x, int incx, + float *y, int incy, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSaxpy(handle, n, alpha, x, incx, y, incy); } template <> -inline cublasStatus_t cublasaxpy(cublasHandle_t handle, - int n, - const double* alpha, - const double* x, - int incx, - double* y, - int incy, - cudaStream_t stream) -{ +inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, + const double *alpha, const double *x, int incx, + double *y, int incy, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDaxpy(handle, n, alpha, x, incx, y, incy); } @@ -151,21 +133,21 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublasSwap( - cublasHandle_t handle, int n, T* x, int incx, T* y, int incy, cudaStream_t stream); +cublasStatus_t cublasSwap(cublasHandle_t handle, int n, T *x, int incx, T *y, + int incy, cudaStream_t stream); template <> -inline cublasStatus_t cublasSwap( - cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, cudaStream_t stream) -{ +inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, float *x, + int incx, float *y, int incy, + cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSswap(handle, n, x, incx, y, incy); } template <> -inline cublasStatus_t cublasSwap( - cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, cudaStream_t stream) -{ +inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, double *x, + int incx, double *y, int incy, + cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDswap(handle, n, x, incx, y, incy); } @@ -177,20 +159,20 @@ inline cublasStatus_t cublasSwap( * @{ */ template -cublasStatus_t cublasCopy( - cublasHandle_t handle, int n, const T* x, int incx, T* y, int incy, cudaStream_t stream); +cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const T *x, int incx, + T *y, int incy, cudaStream_t stream); template <> -inline cublasStatus_t cublasCopy( - cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy, cudaStream_t stream) -{ +inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const float *x, + int incx, float *y, int incy, + cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasScopy(handle, n, x, incx, y, incy); } template <> -inline cublasStatus_t cublasCopy( - cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy, cudaStream_t stream) -{ +inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const double *x, + int incx, double *y, int incy, + cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDcopy(handle, n, x, incx, y, incy); } @@ -201,56 +183,31 @@ inline cublasStatus_t cublasCopy( * @{ */ template -cublasStatus_t cublasgemv(cublasHandle_t handle, - cublasOperation_t transA, - int m, - int n, - const T* alfa, - const T* A, - int lda, - const T* x, - int incx, - const T* beta, - T* y, - int incy, +cublasStatus_t cublasgemv(cublasHandle_t handle, cublasOperation_t transA, + int m, int n, const T *alfa, const T *A, int lda, + const T *x, int incx, const T *beta, T *y, int incy, cudaStream_t stream); template <> inline cublasStatus_t cublasgemv(cublasHandle_t handle, - cublasOperation_t transA, - int m, - int n, - const float* alfa, - const float* A, - int lda, - const float* x, - int incx, - const float* beta, - float* y, - int incy, - cudaStream_t stream) -{ + cublasOperation_t transA, int m, int n, + const float *alfa, const float *A, int lda, + const float *x, int incx, const float *beta, + float *y, int incy, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy); + return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, + incy); } template <> inline cublasStatus_t cublasgemv(cublasHandle_t handle, - cublasOperation_t transA, - int m, - int n, - const double* alfa, - const double* A, - int lda, - const double* x, - int incx, - const double* beta, - double* y, - int incy, - cudaStream_t stream) -{ + cublasOperation_t transA, int m, int n, + const double *alfa, const double *A, int lda, + const double *x, int incx, const double *beta, + double *y, int incy, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy); + return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, + incy); } /** @} */ @@ -259,47 +216,23 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublasger(cublasHandle_t handle, - int m, - int n, - const T* alpha, - const T* x, - int incx, - const T* y, - int incy, - T* A, - int lda, - cudaStream_t stream); +cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, const T *alpha, + const T *x, int incx, const T *y, int incy, T *A, + int lda, cudaStream_t stream); template <> -inline cublasStatus_t cublasger(cublasHandle_t handle, - int m, - int n, - const float* alpha, - const float* x, - int incx, - const float* y, - int incy, - float* A, - int lda, - cudaStream_t stream) -{ +inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, + const float *alpha, const float *x, int incx, + const float *y, int incy, float *A, int lda, + cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda); } template <> -inline cublasStatus_t cublasger(cublasHandle_t handle, - int m, - int n, - const double* alpha, - const double* x, - int incx, - const double* y, - int incy, - double* A, - int lda, - cudaStream_t stream) -{ +inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, + const double *alpha, const double *x, int incx, + const double *y, int incy, double *A, int lda, + cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda); } @@ -310,62 +243,34 @@ inline cublasStatus_t cublasger(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublasgemm(cublasHandle_t handle, - cublasOperation_t transA, - cublasOperation_t transB, - int m, - int n, - int k, - const T* alfa, - const T* A, - int lda, - const T* B, - int ldb, - const T* beta, - T* C, - int ldc, +cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA, + cublasOperation_t transB, int m, int n, int k, + const T *alfa, const T *A, int lda, const T *B, + int ldb, const T *beta, T *C, int ldc, cudaStream_t stream); template <> inline cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, - int m, - int n, - int k, - const float* alfa, - const float* A, - int lda, - const float* B, - int ldb, - const float* beta, - float* C, - int ldc, - cudaStream_t stream) -{ + cublasOperation_t transB, int m, int n, int k, + const float *alfa, const float *A, int lda, + const float *B, int ldb, const float *beta, + float *C, int ldc, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc); + return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, + beta, C, ldc); } template <> inline cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, - int m, - int n, - int k, - const double* alfa, - const double* A, - int lda, - const double* B, - int ldb, - const double* beta, - double* C, - int ldc, - cudaStream_t stream) -{ + cublasOperation_t transB, int m, int n, int k, + const double *alfa, const double *A, int lda, + const double *B, int ldb, const double *beta, + double *C, int ldc, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc); + return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, + beta, C, ldc); } /** @} */ @@ -376,93 +281,38 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle, template cublasStatus_t cublasgemmBatched(cublasHandle_t handle, // NOLINT cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const T* alpha, - const T* const Aarray[], // NOLINT - int lda, - const T* const Barray[], // NOLINT - int ldb, - const T* beta, - T* Carray[], // NOLINT - int ldc, - int batchCount, - cudaStream_t stream); + cublasOperation_t transb, int m, int n, int k, + const T *alpha, + const T *const Aarray[], // NOLINT + int lda, const T *const Barray[], // NOLINT + int ldb, const T *beta, + T *Carray[], // NOLINT + int ldc, int batchCount, cudaStream_t stream); template <> inline cublasStatus_t cublasgemmBatched( // NOLINT - cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float* alpha, - const float* const Aarray[], // NOLINT - int lda, - const float* const Barray[], // NOLINT - int ldb, - const float* beta, - float* Carray[], // NOLINT - int ldc, - int batchCount, - cudaStream_t stream) -{ + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float *alpha, + const float *const Aarray[], // NOLINT + int lda, const float *const Barray[], // NOLINT + int ldb, const float *beta, float *Carray[], // NOLINT + int ldc, int batchCount, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemmBatched(handle, - transa, - transb, - m, - n, - k, - alpha, - Aarray, - lda, - Barray, - ldb, - beta, - Carray, - ldc, - batchCount); + return cublasSgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, + Barray, ldb, beta, Carray, ldc, batchCount); } template <> inline cublasStatus_t cublasgemmBatched( // NOLINT - cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const double* alpha, - const double* const Aarray[], // NOLINT - int lda, - const double* const Barray[], // NOLINT - int ldb, - const double* beta, - double* Carray[], // NOLINT - int ldc, - int batchCount, - cudaStream_t stream) -{ + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const double *alpha, + const double *const Aarray[], // NOLINT + int lda, const double *const Barray[], // NOLINT + int ldb, const double *beta, double *Carray[], // NOLINT + int ldc, int batchCount, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemmBatched(handle, - transa, - transb, - m, - n, - k, - alpha, - Aarray, - lda, - Barray, - ldb, - beta, - Carray, - ldc, - batchCount); + return cublasDgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, + Barray, ldb, beta, Carray, ldc, batchCount); } /** @} */ @@ -472,110 +322,36 @@ inline cublasStatus_t cublasgemmBatched( // NOLINT */ template cublasStatus_t cublasgemmStridedBatched( // NOLINT - cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const T* alpha, - const T* const Aarray, - int lda, - int64_t strideA, - const T* const Barray, - int ldb, - int64_t strideB, - const T* beta, - T* Carray, - int ldc, - int64_t strideC, - int batchCount, + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const T *alpha, const T *const Aarray, int lda, + int64_t strideA, const T *const Barray, int ldb, int64_t strideB, + const T *beta, T *Carray, int ldc, int64_t strideC, int batchCount, cudaStream_t stream); template <> inline cublasStatus_t cublasgemmStridedBatched( // NOLINT - cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float* alpha, - const float* const Aarray, - int lda, - int64_t strideA, - const float* const Barray, - int ldb, - int64_t strideB, - const float* beta, - float* Carray, - int ldc, - int64_t strideC, - int batchCount, - cudaStream_t stream) -{ + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float *alpha, const float *const Aarray, int lda, + int64_t strideA, const float *const Barray, int ldb, int64_t strideB, + const float *beta, float *Carray, int ldc, int64_t strideC, int batchCount, + cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemmStridedBatched(handle, - transa, - transb, - m, - n, - k, - alpha, - Aarray, - lda, - strideA, - Barray, - ldb, - strideB, - beta, - Carray, - ldc, - strideC, - batchCount); + return cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha, + Aarray, lda, strideA, Barray, ldb, strideB, + beta, Carray, ldc, strideC, batchCount); } template <> inline cublasStatus_t cublasgemmStridedBatched( // NOLINT - cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const double* alpha, - const double* const Aarray, - int lda, - int64_t strideA, - const double* const Barray, - int ldb, - int64_t strideB, - const double* beta, - double* Carray, - int ldc, - int64_t strideC, - int batchCount, - cudaStream_t stream) -{ + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const double *alpha, const double *const Aarray, int lda, + int64_t strideA, const double *const Barray, int ldb, int64_t strideB, + const double *beta, double *Carray, int ldc, int64_t strideC, int batchCount, + cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemmStridedBatched(handle, - transa, - transb, - m, - n, - k, - alpha, - Aarray, - lda, - strideA, - Barray, - ldb, - strideB, - beta, - Carray, - ldc, - strideC, - batchCount); + return cublasDgemmStridedBatched(handle, transa, transb, m, n, k, alpha, + Aarray, lda, strideA, Barray, ldb, strideB, + beta, Carray, ldc, strideC, batchCount); } /** @} */ @@ -585,85 +361,51 @@ inline cublasStatus_t cublasgemmStridedBatched( // NOLINT */ template -cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, - int n, // NOLINT - T* const A[], // NOLINT - int lda, - int* P, - int* info, - int batchSize, +cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, int n, // NOLINT + T *const A[], // NOLINT + int lda, int *P, int *info, int batchSize, cudaStream_t stream); template <> -inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT - int n, - float* const A[], // NOLINT - int lda, - int* P, - int* info, - int batchSize, - cudaStream_t stream) -{ +inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT + int n, float *const A[], // NOLINT + int lda, int *P, int *info, + int batchSize, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize); } template <> -inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT - int n, - double* const A[], // NOLINT - int lda, - int* P, - int* info, - int batchSize, - cudaStream_t stream) -{ +inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT + int n, double *const A[], // NOLINT + int lda, int *P, int *info, + int batchSize, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize); } template -cublasStatus_t cublasgetriBatched(cublasHandle_t handle, - int n, // NOLINT - const T* const A[], // NOLINT - int lda, - const int* P, - T* const C[], // NOLINT - int ldc, - int* info, - int batchSize, +cublasStatus_t cublasgetriBatched(cublasHandle_t handle, int n, // NOLINT + const T *const A[], // NOLINT + int lda, const int *P, + T *const C[], // NOLINT + int ldc, int *info, int batchSize, cudaStream_t stream); template <> -inline cublasStatus_t cublasgetriBatched( // NOLINT - cublasHandle_t handle, - int n, - const float* const A[], // NOLINT - int lda, - const int* P, - float* const C[], // NOLINT - int ldc, - int* info, - int batchSize, - cudaStream_t stream) -{ +inline cublasStatus_t cublasgetriBatched( // NOLINT + cublasHandle_t handle, int n, const float *const A[], // NOLINT + int lda, const int *P, float *const C[], // NOLINT + int ldc, int *info, int batchSize, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); } template <> -inline cublasStatus_t cublasgetriBatched( // NOLINT - cublasHandle_t handle, - int n, - const double* const A[], // NOLINT - int lda, - const int* P, - double* const C[], // NOLINT - int ldc, - int* info, - int batchSize, - cudaStream_t stream) -{ +inline cublasStatus_t cublasgetriBatched( // NOLINT + cublasHandle_t handle, int n, const double *const A[], // NOLINT + int lda, const int *P, double *const C[], // NOLINT + int ldc, int *info, int batchSize, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); } @@ -677,57 +419,34 @@ inline cublasStatus_t cublasgetriBatched( // NOLINT template inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT - cublasOperation_t trans, - int m, - int n, - int nrhs, - T* Aarray[], // NOLINT - int lda, - T* Carray[], // NOLINT - int ldc, - int* info, - int* devInfoArray, - int batchSize, - cudaStream_t stream); + cublasOperation_t trans, int m, int n, + int nrhs, T *Aarray[], // NOLINT + int lda, T *Carray[], // NOLINT + int ldc, int *info, int *devInfoArray, + int batchSize, cudaStream_t stream); template <> inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT - cublasOperation_t trans, - int m, - int n, - int nrhs, - float* Aarray[], // NOLINT - int lda, - float* Carray[], // NOLINT - int ldc, - int* info, - int* devInfoArray, - int batchSize, - cudaStream_t stream) -{ + cublasOperation_t trans, int m, int n, + int nrhs, float *Aarray[], // NOLINT + int lda, float *Carray[], // NOLINT + int ldc, int *info, int *devInfoArray, + int batchSize, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgelsBatched( - handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); + return cublasSgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, + info, devInfoArray, batchSize); } template <> inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT - cublasOperation_t trans, - int m, - int n, - int nrhs, - double* Aarray[], // NOLINT - int lda, - double* Carray[], // NOLINT - int ldc, - int* info, - int* devInfoArray, - int batchSize, - cudaStream_t stream) -{ + cublasOperation_t trans, int m, int n, + int nrhs, double *Aarray[], // NOLINT + int lda, double *Carray[], // NOLINT + int ldc, int *info, int *devInfoArray, + int batchSize, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgelsBatched( - handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); + return cublasDgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, + info, devInfoArray, batchSize); } /** @} */ @@ -737,59 +456,33 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT * @{ */ template -cublasStatus_t cublasgeam(cublasHandle_t handle, - cublasOperation_t transA, - cublasOperation_t transB, - int m, - int n, - const T* alfa, - const T* A, - int lda, - const T* beta, - const T* B, - int ldb, - T* C, - int ldc, - cudaStream_t stream); +cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA, + cublasOperation_t transB, int m, int n, const T *alfa, + const T *A, int lda, const T *beta, const T *B, + int ldb, T *C, int ldc, cudaStream_t stream); template <> inline cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, - int m, - int n, - const float* alfa, - const float* A, - int lda, - const float* beta, - const float* B, - int ldb, - float* C, - int ldc, - cudaStream_t stream) -{ + cublasOperation_t transB, int m, int n, + const float *alfa, const float *A, int lda, + const float *beta, const float *B, int ldb, + float *C, int ldc, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc); + return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, + C, ldc); } template <> inline cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, - int m, - int n, - const double* alfa, - const double* A, - int lda, - const double* beta, - const double* B, - int ldb, - double* C, - int ldc, - cudaStream_t stream) -{ + cublasOperation_t transB, int m, int n, + const double *alfa, const double *A, int lda, + const double *beta, const double *B, int ldb, + double *C, int ldc, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc); + return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, + C, ldc); } /** @} */ @@ -798,59 +491,31 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublassymm(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const T* alpha, - const T* A, - int lda, - const T* B, - int ldb, - const T* beta, - T* C, - int ldc, - cudaStream_t stream); +cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, int m, int n, const T *alpha, + const T *A, int lda, const T *B, int ldb, + const T *beta, T *C, int ldc, cudaStream_t stream); template <> -inline cublasStatus_t cublassymm(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const float* alpha, - const float* A, - int lda, - const float* B, - int ldb, - const float* beta, - float* C, - int ldc, - cudaStream_t stream) -{ +inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, int m, int n, + const float *alpha, const float *A, int lda, + const float *B, int ldb, const float *beta, + float *C, int ldc, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); + return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); } template <> -inline cublasStatus_t cublassymm(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const double* alpha, - const double* A, - int lda, - const double* B, - int ldb, - const double* beta, - double* C, - int ldc, - cudaStream_t stream) -{ +inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, int m, int n, + const double *alpha, const double *A, int lda, + const double *B, int ldb, const double *beta, + double *C, int ldc, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); + return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); } /** @} */ @@ -859,51 +524,27 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublassyrk(cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const T* alpha, - const T* A, - int lda, - const T* beta, - T* C, - int ldc, +cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, + cublasOperation_t trans, int n, int k, const T *alpha, + const T *A, int lda, const T *beta, T *C, int ldc, cudaStream_t stream); template <> -inline cublasStatus_t cublassyrk(cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float* alpha, - const float* A, - int lda, - const float* beta, - float* C, - int ldc, - cudaStream_t stream) -{ +inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, + cublasOperation_t trans, int n, int k, + const float *alpha, const float *A, int lda, + const float *beta, float *C, int ldc, + cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } template <> -inline cublasStatus_t cublassyrk(cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const double* alpha, - const double* A, - int lda, - const double* beta, - double* C, - int ldc, - cudaStream_t stream) -{ +inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, + cublasOperation_t trans, int n, int k, + const double *alpha, const double *A, int lda, + const double *beta, double *C, int ldc, + cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } @@ -914,77 +555,52 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublasnrm2( - cublasHandle_t handle, int n, const T* x, int incx, T* result, cudaStream_t stream); +cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const T *x, int incx, + T *result, cudaStream_t stream); template <> -inline cublasStatus_t cublasnrm2( - cublasHandle_t handle, int n, const float* x, int incx, float* result, cudaStream_t stream) -{ +inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const float *x, + int incx, float *result, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSnrm2(handle, n, x, incx, result); } template <> -inline cublasStatus_t cublasnrm2( - cublasHandle_t handle, int n, const double* x, int incx, double* result, cudaStream_t stream) -{ +inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const double *x, + int incx, double *result, + cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDnrm2(handle, n, x, incx, result); } /** @} */ template -cublasStatus_t cublastrsm(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const T* alpha, - const T* A, - int lda, - T* B, - int ldb, +cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int m, int n, const T *alpha, + const T *A, int lda, T *B, int ldb, cudaStream_t stream); template <> -inline cublasStatus_t cublastrsm(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const float* alpha, - const float* A, - int lda, - float* B, - int ldb, - cudaStream_t stream) -{ +inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int m, int n, + const float *alpha, const float *A, int lda, + float *B, int ldb, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); + return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, + ldb); } template <> -inline cublasStatus_t cublastrsm(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const double* alpha, - const double* A, - int lda, - double* B, - int ldb, - cudaStream_t stream) -{ +inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int m, int n, + const double *alpha, const double *A, int lda, + double *B, int ldb, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); + return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, + ldb); } /** @@ -992,39 +608,21 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublasdot(cublasHandle_t handle, - int n, - const T* x, - int incx, - const T* y, - int incy, - T* result, - cudaStream_t stream); +cublasStatus_t cublasdot(cublasHandle_t handle, int n, const T *x, int incx, + const T *y, int incy, T *result, cudaStream_t stream); template <> -inline cublasStatus_t cublasdot(cublasHandle_t handle, - int n, - const float* x, - int incx, - const float* y, - int incy, - float* result, - cudaStream_t stream) -{ +inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const float *x, + int incx, const float *y, int incy, + float *result, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSdot(handle, n, x, incx, y, incy, result); } template <> -inline cublasStatus_t cublasdot(cublasHandle_t handle, - int n, - const double* x, - int incx, - const double* y, - int incy, - double* result, - cudaStream_t stream) -{ +inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x, + int incx, const double *y, int incy, + double *result, cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDdot(handle, n, x, incx, y, incy, result); } @@ -1044,8 +642,7 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, // template<> inline cublasStatus_t cublassetpointermode(cublasHandle_t handle, cublasPointerMode_t mode, - cudaStream_t stream) -{ + cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSetPointerMode(handle, mode); } @@ -1056,21 +653,21 @@ inline cublasStatus_t cublassetpointermode(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublasscal( - cublasHandle_t handle, int n, const T* alpha, T* x, int incx, cudaStream_t stream); +cublasStatus_t cublasscal(cublasHandle_t handle, int n, const T *alpha, T *x, + int incx, cudaStream_t stream); template <> -inline cublasStatus_t cublasscal( - cublasHandle_t handle, int n, const float* alpha, float* x, int incx, cudaStream_t stream) -{ +inline cublasStatus_t cublasscal(cublasHandle_t handle, int n, + const float *alpha, float *x, int incx, + cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSscal(handle, n, alpha, x, incx); } template <> -inline cublasStatus_t cublasscal( - cublasHandle_t handle, int n, const double* alpha, double* x, int incx, cudaStream_t stream) -{ +inline cublasStatus_t cublasscal(cublasHandle_t handle, int n, + const double *alpha, double *x, int incx, + cudaStream_t stream) { CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDscal(handle, n, alpha, x, incx); } diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/cusolver_wrappers.h index 76a9f40f4d..0eadf47fe3 100644 --- a/cpp/include/raft/linalg/cusolver_wrappers.h +++ b/cpp/include/raft/linalg/cusolver_wrappers.h @@ -24,7 +24,8 @@ #include #define _CUSOLVER_ERR_TO_STR(err) \ - case err: return #err; + case err: \ + return #err; namespace raft { @@ -32,15 +33,16 @@ namespace raft { * @brief Exception thrown when a cuSOLVER error is encountered. */ struct cusolver_error : public raft::exception { - explicit cusolver_error(char const* const message) : raft::exception(message) {} - explicit cusolver_error(std::string const& message) : raft::exception(message) {} + explicit cusolver_error(char const *const message) + : raft::exception(message) {} + explicit cusolver_error(std::string const &message) + : raft::exception(message) {} }; namespace linalg { namespace detail { -inline const char* cusolver_error_to_string(cusolverStatus_t err) -{ +inline const char *cusolver_error_to_string(cusolverStatus_t err) { switch (err) { _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); @@ -52,7 +54,8 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err) _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); - default: return "CUSOLVER_STATUS_UNKNOWN"; + default: + return "CUSOLVER_STATUS_UNKNOWN"; }; } @@ -73,11 +76,8 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err) cusolverStatus_t const status = (call); \ if (CUSOLVER_STATUS_SUCCESS != status) { \ std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "cuSOLVER error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ + SET_ERROR_MSG(msg, "cuSOLVER error encountered at: ", \ + "call='%s', Reason=%d:%s", #call, status, \ raft::linalg::detail::cusolver_error_to_string(status)); \ throw raft::cusolver_error(msg); \ } \ @@ -107,76 +107,42 @@ namespace linalg { * @{ */ template -cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, - int m, // NOLINT - int n, - T* A, - int lda, - T* Workspace, - int* devIpiv, - int* devInfo, +cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, int m, // NOLINT + int n, T *A, int lda, T *Workspace, + int *devIpiv, int *devInfo, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT - int m, - int n, - float* A, - int lda, - float* Workspace, - int* devIpiv, - int* devInfo, - cudaStream_t stream) -{ + int m, int n, float *A, int lda, + float *Workspace, int *devIpiv, + int *devInfo, cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); } template <> inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT - int m, - int n, - double* A, - int lda, - double* Workspace, - int* devIpiv, - int* devInfo, - cudaStream_t stream) -{ + int m, int n, double *A, int lda, + double *Workspace, int *devIpiv, + int *devInfo, cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); } template cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, - int m, - int n, - T* A, - int lda, - int* Lwork); + cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork); template <> inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, - int m, - int n, - float* A, - int lda, - int* Lwork) -{ + cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) { return cusolverDnSgetrf_bufferSize(handle, m, n, A, lda, Lwork); } template <> inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, - int m, - int n, - double* A, - int lda, - int* Lwork) -{ + cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) { return cusolverDnDgetrf_bufferSize(handle, m, n, A, lda, Lwork); } @@ -186,49 +152,30 @@ inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT - cublasOperation_t trans, - int n, - int nrhs, - const T* A, - int lda, - const int* devIpiv, - T* B, - int ldb, - int* devInfo, - cudaStream_t stream); + cublasOperation_t trans, int n, int nrhs, + const T *A, int lda, const int *devIpiv, T *B, + int ldb, int *devInfo, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT - cublasOperation_t trans, - int n, - int nrhs, - const float* A, - int lda, - const int* devIpiv, - float* B, - int ldb, - int* devInfo, - cudaStream_t stream) -{ + cublasOperation_t trans, int n, + int nrhs, const float *A, int lda, + const int *devIpiv, float *B, int ldb, + int *devInfo, cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); + return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, + devInfo); } template <> inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT - cublasOperation_t trans, - int n, - int nrhs, - const double* A, - int lda, - const int* devIpiv, - double* B, - int ldb, - int* devInfo, - cudaStream_t stream) -{ + cublasOperation_t trans, int n, + int nrhs, const double *A, int lda, + const int *devIpiv, double *B, int ldb, + int *devInfo, cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); + return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, + devInfo); } /** @} */ @@ -238,40 +185,20 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT */ template cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - cublasFillMode_t uplo, - int n, - const T* A, - int lda, - const T* W, - int* lwork); + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const T *A, int lda, const T *W, int *lwork); template <> inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - cublasFillMode_t uplo, - int n, - const float* A, - int lda, - const float* W, - int* lwork) -{ + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const float *A, int lda, const float *W, int *lwork) { return cusolverDnSsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork); } template <> inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - cublasFillMode_t uplo, - int n, - const double* A, - int lda, - const double* W, - int* lwork) -{ + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const double *A, int lda, const double *W, int *lwork) { return cusolverDnDsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork); } /** @} */ @@ -282,96 +209,52 @@ inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnsyevj(cusolverDnHandle_t handle, // NOLINT - cusolverEigMode_t jobz, - cublasFillMode_t uplo, - int n, - T* A, - int lda, - T* W, - T* work, - int lwork, - int* info, - syevjInfo_t params, + cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, T *A, int lda, T *W, T *work, int lwork, + int *info, syevjInfo_t params, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnsyevj( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - cublasFillMode_t uplo, - int n, - float* A, - int lda, - float* W, - float* work, - int lwork, - int* info, - syevjInfo_t params, - cudaStream_t stream) -{ + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, float *A, int lda, float *W, float *work, int lwork, int *info, + syevjInfo_t params, cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); + return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, + params); } template <> inline cusolverStatus_t cusolverDnsyevj( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - cublasFillMode_t uplo, - int n, - double* A, - int lda, - double* W, - double* work, - int lwork, - int* info, - syevjInfo_t params, - cudaStream_t stream) -{ + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, double *A, int lda, double *W, double *work, int lwork, int *info, + syevjInfo_t params, cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); + return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, + params); } template cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - cublasFillMode_t uplo, - int n, - const T* A, - int lda, - const T* W, - int* lwork, - syevjInfo_t params); + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const T *A, int lda, const T *W, int *lwork, syevjInfo_t params); template <> inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - cublasFillMode_t uplo, - int n, - const float* A, - int lda, - const float* W, - int* lwork, - syevjInfo_t params) -{ - return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params); + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const float *A, int lda, const float *W, int *lwork, + syevjInfo_t params) { + return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, + params); } template <> inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - cublasFillMode_t uplo, - int n, - const double* A, - int lda, - const double* W, - int* lwork, - syevjInfo_t params) -{ - return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params); + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const double *A, int lda, const double *W, int *lwork, + syevjInfo_t params) { + return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, + params); } /** @} */ @@ -381,49 +264,32 @@ inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT - cusolverEigMode_t jobz, - cublasFillMode_t uplo, - int n, - T* A, - int lda, - T* W, - T* work, - int lwork, - int* devInfo, - cudaStream_t stream); + cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, T *A, int lda, T *W, T *work, int lwork, + int *devInfo, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT cusolverEigMode_t jobz, - cublasFillMode_t uplo, - int n, - float* A, - int lda, - float* W, - float* work, - int lwork, - int* devInfo, - cudaStream_t stream) -{ + cublasFillMode_t uplo, int n, float *A, + int lda, float *W, float *work, + int lwork, int *devInfo, + cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo); + return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, + devInfo); } template <> inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT cusolverEigMode_t jobz, - cublasFillMode_t uplo, - int n, - double* A, - int lda, - double* W, - double* work, - int lwork, - int* devInfo, - cudaStream_t stream) -{ + cublasFillMode_t uplo, int n, double *A, + int lda, double *W, double *work, + int lwork, int *devInfo, + cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo); + return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, + devInfo); } /** @} */ @@ -431,134 +297,57 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT /** * @defgroup syevdx cusolver syevdx operations * @{ - */ +*/ template cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - cusolverEigRange_t range, - cublasFillMode_t uplo, - int n, - const T* A, - int lda, - T vl, - T vu, - int il, - int iu, - int* h_meig, - const T* W, - int* lwork); + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, const T *A, int lda, T vl, T vu, int il, int iu, + int *h_meig, const T *W, int *lwork); template <> inline cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - cusolverEigRange_t range, - cublasFillMode_t uplo, - int n, - const float* A, - int lda, - float vl, - float vu, - int il, - int iu, - int* h_meig, - const float* W, - int* lwork) -{ - return cusolverDnSsyevdx_bufferSize( - handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork); + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, const float *A, int lda, float vl, float vu, + int il, int iu, int *h_meig, const float *W, int *lwork) { + return cusolverDnSsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl, + vu, il, iu, h_meig, W, lwork); } template <> inline cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - cusolverEigRange_t range, - cublasFillMode_t uplo, - int n, - const double* A, - int lda, - double vl, - double vu, - int il, - int iu, - int* h_meig, - const double* W, - int* lwork) -{ - return cusolverDnDsyevdx_bufferSize( - handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork); + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, const double *A, int lda, double vl, double vu, + int il, int iu, int *h_meig, const double *W, int *lwork) { + return cusolverDnDsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl, + vu, il, iu, h_meig, W, lwork); } template cusolverStatus_t cusolverDnsyevdx( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - cusolverEigRange_t range, - cublasFillMode_t uplo, - int n, - T* A, - int lda, - T vl, - T vu, - int il, - int iu, - int* h_meig, - T* W, - T* work, - int lwork, - int* devInfo, - cudaStream_t stream); + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, T *A, int lda, T vl, T vu, int il, int iu, + int *h_meig, T *W, T *work, int lwork, int *devInfo, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnsyevdx( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - cusolverEigRange_t range, - cublasFillMode_t uplo, - int n, - float* A, - int lda, - float vl, - float vu, - int il, - int iu, - int* h_meig, - float* W, - float* work, - int lwork, - int* devInfo, - cudaStream_t stream) -{ + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, float *A, int lda, float vl, float vu, int il, + int iu, int *h_meig, float *W, float *work, int lwork, int *devInfo, + cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSsyevdx( - handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); + return cusolverDnSsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, + h_meig, W, work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnsyevdx( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - cusolverEigRange_t range, - cublasFillMode_t uplo, - int n, - double* A, - int lda, - double vl, - double vu, - int il, - int iu, - int* h_meig, - double* W, - double* work, - int lwork, - int* devInfo, - cudaStream_t stream) -{ + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, double *A, int lda, double vl, double vu, + int il, int iu, int *h_meig, double *W, double *work, int lwork, int *devInfo, + cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDsyevdx( - handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); + return cusolverDnDsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, + h_meig, W, work, lwork, devInfo); } /** @} */ #endif @@ -569,11 +358,7 @@ inline cusolverStatus_t cusolverDnsyevdx( // NOLINT */ template cusolverStatus_t cusolverDngesvd_bufferSize( // NOLINT - cusolverDnHandle_t handle, - int m, - int n, - int* lwork) -{ + cusolverDnHandle_t handle, int m, int n, int *lwork) { if (std::is_same, float>::value) { return cusolverDnSgesvd_bufferSize(handle, m, n, lwork); } else { @@ -582,194 +367,72 @@ cusolverStatus_t cusolverDngesvd_bufferSize( // NOLINT } template cusolverStatus_t cusolverDngesvd( // NOLINT - cusolverDnHandle_t handle, - signed char jobu, - signed char jobvt, - int m, - int n, - T* A, - int lda, - T* S, - T* U, - int ldu, - T* VT, - int ldvt, - T* work, - int lwork, - T* rwork, - int* devInfo, - cudaStream_t stream); + cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, + T *A, int lda, T *S, T *U, int ldu, T *VT, int ldvt, T *work, int lwork, + T *rwork, int *devInfo, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngesvd( // NOLINT - cusolverDnHandle_t handle, - signed char jobu, - signed char jobvt, - int m, - int n, - float* A, - int lda, - float* S, - float* U, - int ldu, - float* VT, - int ldvt, - float* work, - int lwork, - float* rwork, - int* devInfo, - cudaStream_t stream) -{ + cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, + float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt, + float *work, int lwork, float *rwork, int *devInfo, cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSgesvd( - handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo); + return cusolverDnSgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, + ldvt, work, lwork, rwork, devInfo); } template <> inline cusolverStatus_t cusolverDngesvd( // NOLINT - cusolverDnHandle_t handle, - signed char jobu, - signed char jobvt, - int m, - int n, - double* A, - int lda, - double* S, - double* U, - int ldu, - double* VT, - int ldvt, - double* work, - int lwork, - double* rwork, - int* devInfo, - cudaStream_t stream) -{ + cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, + double *A, int lda, double *S, double *U, int ldu, double *VT, int ldvt, + double *work, int lwork, double *rwork, int *devInfo, cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDgesvd( - handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo); + return cusolverDnDgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, + ldvt, work, lwork, rwork, devInfo); } template inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - int econ, - int m, - int n, - const T* A, - int lda, - const T* S, - const T* U, - int ldu, - const T* V, - int ldv, - int* lwork, - gesvdjInfo_t params); + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + const T *A, int lda, const T *S, const T *U, int ldu, const T *V, int ldv, + int *lwork, gesvdjInfo_t params); template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - int econ, - int m, - int n, - const float* A, - int lda, - const float* S, - const float* U, - int ldu, - const float* V, - int ldv, - int* lwork, - gesvdjInfo_t params) -{ - return cusolverDnSgesvdj_bufferSize( - handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params); + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + const float *A, int lda, const float *S, const float *U, int ldu, + const float *V, int ldv, int *lwork, gesvdjInfo_t params) { + return cusolverDnSgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U, + ldu, V, ldv, lwork, params); } template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - int econ, - int m, - int n, - const double* A, - int lda, - const double* S, - const double* U, - int ldu, - const double* V, - int ldv, - int* lwork, - gesvdjInfo_t params) -{ - return cusolverDnDgesvdj_bufferSize( - handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params); + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + const double *A, int lda, const double *S, const double *U, int ldu, + const double *V, int ldv, int *lwork, gesvdjInfo_t params) { + return cusolverDnDgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U, + ldu, V, ldv, lwork, params); } template inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - int econ, - int m, - int n, - T* A, - int lda, - T* S, - T* U, - int ldu, - T* V, - int ldv, - T* work, - int lwork, - int* info, - gesvdjInfo_t params, - cudaStream_t stream); + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + T *A, int lda, T *S, T *U, int ldu, T *V, int ldv, T *work, int lwork, + int *info, gesvdjInfo_t params, cudaStream_t stream); template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - int econ, - int m, - int n, - float* A, - int lda, - float* S, - float* U, - int ldu, - float* V, - int ldv, - float* work, - int lwork, - int* info, - gesvdjInfo_t params, - cudaStream_t stream) -{ + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + float *A, int lda, float *S, float *U, int ldu, float *V, int ldv, + float *work, int lwork, int *info, gesvdjInfo_t params, cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSgesvdj( - handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); + return cusolverDnSgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, + work, lwork, info, params); } template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT - cusolverDnHandle_t handle, - cusolverEigMode_t jobz, - int econ, - int m, - int n, - double* A, - int lda, - double* S, - double* U, - int ldu, - double* V, - int ldv, - double* work, - int lwork, - int* info, - gesvdjInfo_t params, - cudaStream_t stream) -{ + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + double *A, int lda, double *S, double *U, int ldu, double *V, int ldv, + double *work, int lwork, int *info, gesvdjInfo_t params, + cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDgesvdj( - handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); + return cusolverDnDgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, + work, lwork, info, params); } /** @} */ @@ -779,74 +442,43 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT */ template cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, - cublasFillMode_t uplo, - int n, - T* A, - int lda, - int* Lwork); + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, T *A, int lda, + int *Lwork); template <> inline cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, - cublasFillMode_t uplo, - int n, - float* A, - int lda, - int* Lwork) -{ + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, float *A, int lda, + int *Lwork) { return cusolverDnSpotrf_bufferSize(handle, uplo, n, A, lda, Lwork); } template <> inline cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, - cublasFillMode_t uplo, - int n, - double* A, - int lda, - int* Lwork) -{ + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda, + int *Lwork) { return cusolverDnDpotrf_bufferSize(handle, uplo, n, A, lda, Lwork); } template inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, - int n, - T* A, - int lda, - T* Workspace, - int Lwork, - int* devInfo, - cudaStream_t stream); + cublasFillMode_t uplo, int n, T *A, + int lda, T *Workspace, int Lwork, + int *devInfo, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, - int n, - float* A, - int lda, - float* Workspace, - int Lwork, - int* devInfo, - cudaStream_t stream) -{ + cublasFillMode_t uplo, int n, float *A, + int lda, float *Workspace, int Lwork, + int *devInfo, cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, - int n, - double* A, - int lda, - double* Workspace, - int Lwork, - int* devInfo, - cudaStream_t stream) -{ + cublasFillMode_t uplo, int n, double *A, + int lda, double *Workspace, int Lwork, + int *devInfo, cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); } @@ -858,44 +490,26 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT */ template cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, - int n, - int nrhs, - const T* A, - int lda, - T* B, - int ldb, - int* devInfo, - cudaStream_t stream); + cublasFillMode_t uplo, int n, int nrhs, + const T *A, int lda, T *B, int ldb, + int *devInfo, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, - int n, - int nrhs, - const float* A, - int lda, - float* B, - int ldb, - int* devInfo, - cudaStream_t stream) -{ + cublasFillMode_t uplo, int n, int nrhs, + const float *A, int lda, float *B, + int ldb, int *devInfo, + cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); } template <> inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, - int n, - int nrhs, - const double* A, - int lda, - double* B, - int ldb, - int* devInfo, - cudaStream_t stream) -{ + cublasFillMode_t uplo, int n, int nrhs, + const double *A, int lda, double *B, + int ldb, int *devInfo, + cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); } @@ -906,75 +520,38 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT * @{ */ template -cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, - int m, // NOLINT - int n, - T* A, - int lda, - T* TAU, - T* Workspace, - int Lwork, - int* devInfo, - cudaStream_t stream); +cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, int m, // NOLINT + int n, T *A, int lda, T *TAU, T *Workspace, + int Lwork, int *devInfo, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT - int m, - int n, - float* A, - int lda, - float* TAU, - float* Workspace, - int Lwork, - int* devInfo, - cudaStream_t stream) -{ + int m, int n, float *A, int lda, + float *TAU, float *Workspace, int Lwork, + int *devInfo, cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); } template <> inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT - int m, - int n, - double* A, - int lda, - double* TAU, - double* Workspace, - int Lwork, - int* devInfo, - cudaStream_t stream) -{ + int m, int n, double *A, int lda, + double *TAU, double *Workspace, + int Lwork, int *devInfo, + cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); } template cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, - int m, - int n, - T* A, - int lda, - int* Lwork); + cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork); template <> inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, - int m, - int n, - float* A, - int lda, - int* Lwork) -{ + cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) { return cusolverDnSgeqrf_bufferSize(handle, m, n, A, lda, Lwork); } template <> inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, - int m, - int n, - double* A, - int lda, - int* Lwork) -{ + cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) { return cusolverDnDgeqrf_bufferSize(handle, m, n, A, lda, Lwork); } /** @} */ @@ -985,86 +562,38 @@ inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnorgqr( // NOLINT - cusolverDnHandle_t handle, - int m, - int n, - int k, - T* A, - int lda, - const T* tau, - T* work, - int lwork, - int* devInfo, - cudaStream_t stream); + cusolverDnHandle_t handle, int m, int n, int k, T *A, int lda, const T *tau, + T *work, int lwork, int *devInfo, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnorgqr( // NOLINT - cusolverDnHandle_t handle, - int m, - int n, - int k, - float* A, - int lda, - const float* tau, - float* work, - int lwork, - int* devInfo, - cudaStream_t stream) -{ + cusolverDnHandle_t handle, int m, int n, int k, float *A, int lda, + const float *tau, float *work, int lwork, int *devInfo, cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnorgqr( // NOLINT - cusolverDnHandle_t handle, - int m, - int n, - int k, - double* A, - int lda, - const double* tau, - double* work, - int lwork, - int* devInfo, - cudaStream_t stream) -{ + cusolverDnHandle_t handle, int m, int n, int k, double *A, int lda, + const double *tau, double *work, int lwork, int *devInfo, + cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); } template cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, - int m, - int n, - int k, - const T* A, - int lda, - const T* TAU, - int* lwork); + cusolverDnHandle_t handle, int m, int n, int k, const T *A, int lda, + const T *TAU, int *lwork); template <> inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, - int m, - int n, - int k, - const float* A, - int lda, - const float* TAU, - int* lwork) -{ + cusolverDnHandle_t handle, int m, int n, int k, const float *A, int lda, + const float *TAU, int *lwork) { return cusolverDnSorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork); } template <> inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, - int m, - int n, - int k, - const double* A, - int lda, - const double* TAU, - int* lwork) -{ + cusolverDnHandle_t handle, int m, int n, int k, const double *A, int lda, + const double *TAU, int *lwork) { return cusolverDnDorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork); } /** @} */ @@ -1075,114 +604,53 @@ inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnormqr(cusolverDnHandle_t handle, // NOLINT - cublasSideMode_t side, - cublasOperation_t trans, - int m, - int n, - int k, - const T* A, - int lda, - const T* tau, - T* C, - int ldc, - T* work, - int lwork, - int* devInfo, - cudaStream_t stream); + cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const T *A, int lda, + const T *tau, T *C, int ldc, T *work, + int lwork, int *devInfo, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnormqr( // NOLINT - cusolverDnHandle_t handle, - cublasSideMode_t side, - cublasOperation_t trans, - int m, - int n, - int k, - const float* A, - int lda, - const float* tau, - float* C, - int ldc, - float* work, - int lwork, - int* devInfo, - cudaStream_t stream) -{ + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const float *A, int lda, const float *tau, float *C, + int ldc, float *work, int lwork, int *devInfo, cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); + return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, + work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnormqr( // NOLINT - cusolverDnHandle_t handle, - cublasSideMode_t side, - cublasOperation_t trans, - int m, - int n, - int k, - const double* A, - int lda, - const double* tau, - double* C, - int ldc, - double* work, - int lwork, - int* devInfo, - cudaStream_t stream) -{ + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const double *A, int lda, const double *tau, double *C, + int ldc, double *work, int lwork, int *devInfo, cudaStream_t stream) { CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); + return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, + work, lwork, devInfo); } template cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, - cublasSideMode_t side, - cublasOperation_t trans, - int m, - int n, - int k, - const T* A, - int lda, - const T* tau, - const T* C, - int ldc, - int* lwork); + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const T *A, int lda, const T *tau, const T *C, int ldc, + int *lwork); template <> inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, - cublasSideMode_t side, - cublasOperation_t trans, - int m, - int n, - int k, - const float* A, - int lda, - const float* tau, - const float* C, - int ldc, - int* lwork) -{ - return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const float *A, int lda, const float *tau, + const float *C, int ldc, int *lwork) { + return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, + C, ldc, lwork); } template <> inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, - cublasSideMode_t side, - cublasOperation_t trans, - int m, - int n, - int k, - const double* A, - int lda, - const double* tau, - const double* C, - int ldc, - int* lwork) -{ - return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const double *A, int lda, const double *tau, + const double *C, int ldc, int *lwork) { + return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, + C, ldc, lwork); } /** @} */ @@ -1192,136 +660,62 @@ inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT */ template cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT - cusolverSpHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const T* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - int batchSize, - csrqrInfo_t info, - size_t* internalDataInBytes, - size_t* workspaceInBytes); + cusolverSpHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA, + const int *csrColIndA, int batchSize, csrqrInfo_t info, + size_t *internalDataInBytes, size_t *workspaceInBytes); template <> inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT - cusolverSpHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const float* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - int batchSize, - csrqrInfo_t info, - size_t* internalDataInBytes, - size_t* workspaceInBytes) -{ - return cusolverSpScsrqrBufferInfoBatched(handle, - m, - n, - nnzA, - descrA, - csrValA, - csrRowPtrA, - csrColIndA, - batchSize, - info, - internalDataInBytes, - workspaceInBytes); + cusolverSpHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA, + const int *csrColIndA, int batchSize, csrqrInfo_t info, + size_t *internalDataInBytes, size_t *workspaceInBytes) { + return cusolverSpScsrqrBufferInfoBatched( + handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize, + info, internalDataInBytes, workspaceInBytes); } template <> inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT - cusolverSpHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const double* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - int batchSize, - csrqrInfo_t info, - size_t* internalDataInBytes, - size_t* workspaceInBytes) -{ - return cusolverSpDcsrqrBufferInfoBatched(handle, - m, - n, - nnzA, - descrA, - csrValA, - csrRowPtrA, - csrColIndA, - batchSize, - info, - internalDataInBytes, - workspaceInBytes); + cusolverSpHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA, + const int *csrColIndA, int batchSize, csrqrInfo_t info, + size_t *internalDataInBytes, size_t *workspaceInBytes) { + return cusolverSpDcsrqrBufferInfoBatched( + handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize, + info, internalDataInBytes, workspaceInBytes); } template cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT - cusolverSpHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const T* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - const T* b, - T* x, - int batchSize, - csrqrInfo_t info, - void* pBuffer, - cudaStream_t stream); + cusolverSpHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA, + const int *csrColIndA, const T *b, T *x, int batchSize, csrqrInfo_t info, + void *pBuffer, cudaStream_t stream); template <> inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT - cusolverSpHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const float* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - const float* b, - float* x, - int batchSize, - csrqrInfo_t info, - void* pBuffer, - cudaStream_t stream) -{ + cusolverSpHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA, + const int *csrColIndA, const float *b, float *x, int batchSize, + csrqrInfo_t info, void *pBuffer, cudaStream_t stream) { CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); - return cusolverSpScsrqrsvBatched( - handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); + return cusolverSpScsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA, + csrRowPtrA, csrColIndA, b, x, batchSize, + info, pBuffer); } template <> inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT - cusolverSpHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const double* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - const double* b, - double* x, - int batchSize, - csrqrInfo_t info, - void* pBuffer, - cudaStream_t stream) -{ + cusolverSpHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA, + const int *csrColIndA, const double *b, double *x, int batchSize, + csrqrInfo_t info, void *pBuffer, cudaStream_t stream) { CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); - return cusolverSpDcsrqrsvBatched( - handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); + return cusolverSpDcsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA, + csrRowPtrA, csrColIndA, b, x, batchSize, + info, pBuffer); } /** @} */ diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh index 562a3d8991..c848ac1f4b 100644 --- a/cpp/include/raft/linalg/divide.cuh +++ b/cpp/include/raft/linalg/divide.cuh @@ -33,10 +33,11 @@ namespace linalg { * @{ */ template -void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) -{ +void divideScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, + cudaStream_t stream) { unaryOp( - out, in, len, [scalar] __device__(math_t in) { return in / scalar; }, stream); + out, in, len, [scalar] __device__(math_t in) { return in / scalar; }, + stream); } /** @} */ diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh index 75e77ac0ce..6172618380 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/eig.cuh @@ -41,43 +41,26 @@ namespace linalg { * @{ */ template -void eigDC(const raft::handle_t& handle, - const math_t* in, - int n_rows, - int n_cols, - math_t* eig_vectors, - math_t* eig_vals, - cudaStream_t stream) -{ - auto allocator = handle.get_device_allocator(); +void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, + int n_cols, math_t *eig_vectors, math_t *eig_vals, + cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int lwork; - CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, - n_rows, - in, - n_cols, - eig_vals, - &lwork)); + CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, n_rows, in, + n_cols, eig_vals, &lwork)); raft::mr::device::buffer d_work(allocator, stream, lwork); raft::mr::device::buffer d_dev_info(allocator, stream, 1); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, - n_rows, - eig_vectors, - n_cols, - eig_vals, - d_work.data(), - lwork, - d_dev_info.data(), - stream)); + CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, + n_cols, eig_vals, d_work.data(), lwork, + d_dev_info.data(), stream)); CUDA_CHECK(cudaGetLastError()); int dev_info; @@ -107,80 +90,39 @@ enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT }; * @{ */ template -void eigSelDC(const raft::handle_t& handle, - math_t* in, - int n_rows, - int n_cols, - int n_eig_vals, - math_t* eig_vectors, - math_t* eig_vals, - EigVecMemUsage memUsage, - cudaStream_t stream) -{ - auto allocator = handle.get_device_allocator(); +void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, + int n_eig_vals, math_t *eig_vectors, math_t *eig_vals, + EigVecMemUsage memUsage, cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int lwork; int h_meig; - CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, - n_rows, - in, - n_cols, - math_t(0.0), - math_t(0.0), - n_cols - n_eig_vals + 1, - n_cols, - &h_meig, - eig_vals, - &lwork)); + CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), + n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork)); raft::mr::device::buffer d_work(allocator, stream, lwork); raft::mr::device::buffer d_dev_info(allocator, stream, 1); raft::mr::device::buffer d_eig_vectors(allocator, stream, 0); if (memUsage == OVERWRITE_INPUT) { - CUSOLVER_CHECK(cusolverDnsyevdx(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, - n_rows, - in, - n_cols, - math_t(0.0), - math_t(0.0), - n_cols - n_eig_vals + 1, - n_cols, - &h_meig, - eig_vals, - d_work.data(), - lwork, - d_dev_info.data(), - stream)); + CUSOLVER_CHECK(cusolverDnsyevdx( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), + n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, d_work.data(), lwork, + d_dev_info.data(), stream)); } else if (memUsage == COPY_INPUT) { d_eig_vectors.resize(n_rows * n_cols, stream); raft::matrix::copy(in, d_eig_vectors.data(), n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnsyevdx(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, - n_rows, - eig_vectors, - n_cols, - math_t(0.0), - math_t(0.0), - n_cols - n_eig_vals + 1, - n_cols, - &h_meig, - eig_vals, - d_work.data(), - lwork, - d_dev_info.data(), - stream)); + CUSOLVER_CHECK(cusolverDnsyevdx( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, n_cols, math_t(0.0), + math_t(0.0), n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, + d_work.data(), lwork, d_dev_info.data(), stream)); } CUDA_CHECK(cudaGetLastError()); @@ -193,10 +135,11 @@ void eigSelDC(const raft::handle_t& handle, "This usually occurs when some of the features do not vary enough."); if (memUsage == OVERWRITE_INPUT) { - raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals, stream); + raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals, + stream); } else if (memUsage == COPY_INPUT) { - raft::matrix::truncZeroOrigin( - d_eig_vectors.data(), n_rows, eig_vectors, n_rows, n_eig_vals, stream); + raft::matrix::truncZeroOrigin(d_eig_vectors.data(), n_rows, eig_vectors, + n_rows, n_eig_vals, stream); } } @@ -217,17 +160,10 @@ void eigSelDC(const raft::handle_t& handle, * @{ */ template -void eigJacobi(const raft::handle_t& handle, - const math_t* in, - int n_rows, - int n_cols, - math_t* eig_vectors, - math_t* eig_vals, - cudaStream_t stream, - math_t tol = 1.e-7, - int sweeps = 15) -{ - auto allocator = handle.get_device_allocator(); +void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, + int n_cols, math_t *eig_vectors, math_t *eig_vals, + cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) { + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); syevjInfo_t syevj_params = nullptr; @@ -236,36 +172,23 @@ void eigJacobi(const raft::handle_t& handle, CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(syevj_params, sweeps)); int lwork; - CUSOLVER_CHECK(cusolverDnsyevj_bufferSize(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, - n_rows, - eig_vectors, - n_cols, - eig_vals, - &lwork, - syevj_params)); + CUSOLVER_CHECK(cusolverDnsyevj_bufferSize( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows, + eig_vectors, n_cols, eig_vals, &lwork, syevj_params)); raft::mr::device::buffer d_work(allocator, stream, lwork); raft::mr::device::buffer dev_info(allocator, stream, 1); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, - n_rows, - eig_vectors, - n_cols, - eig_vals, - d_work.data(), - lwork, - dev_info.data(), - syevj_params, - stream)); + CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, + n_cols, eig_vals, d_work.data(), lwork, + dev_info.data(), syevj_params, stream)); int executed_sweeps; - CUSOLVER_CHECK(cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps)); + CUSOLVER_CHECK( + cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps)); CUDA_CHECK(cudaGetLastError()); CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params)); diff --git a/cpp/include/raft/linalg/eltwise.cuh b/cpp/include/raft/linalg/eltwise.cuh index 097c3ac218..1c6dee562d 100644 --- a/cpp/include/raft/linalg/eltwise.cuh +++ b/cpp/include/raft/linalg/eltwise.cuh @@ -34,17 +34,19 @@ namespace linalg { * @{ */ template -void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) -{ +void scalarAdd(OutType *out, const InType *in, InType scalar, IdxType len, + cudaStream_t stream) { raft::linalg::unaryOp( - out, in, len, [scalar] __device__(InType in) { return in + scalar; }, stream); + out, in, len, [scalar] __device__(InType in) { return in + scalar; }, + stream); } template -void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) -{ +void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len, + cudaStream_t stream) { raft::linalg::unaryOp( - out, in, len, [scalar] __device__(InType in) { return in * scalar; }, stream); + out, in, len, [scalar] __device__(InType in) { return in * scalar; }, + stream); } /** @} */ @@ -60,46 +62,42 @@ void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, * @{ */ template -void eltwiseAdd( - OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) -{ +void eltwiseAdd(OutType *out, const InType *in1, const InType *in2, IdxType len, + cudaStream_t stream) { binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, + stream); } template -void eltwiseSub( - OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) -{ +void eltwiseSub(OutType *out, const InType *in1, const InType *in2, IdxType len, + cudaStream_t stream) { binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; }, stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; }, + stream); } template -void eltwiseMultiply( - OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) -{ +void eltwiseMultiply(OutType *out, const InType *in1, const InType *in2, + IdxType len, cudaStream_t stream) { binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; }, stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; }, + stream); } template -void eltwiseDivide( - OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) -{ +void eltwiseDivide(OutType *out, const InType *in1, const InType *in2, + IdxType len, cudaStream_t stream) { binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; }, stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; }, + stream); } template -void eltwiseDivideCheckZero( - OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) -{ +void eltwiseDivideCheckZero(OutType *out, const InType *in1, const InType *in2, + IdxType len, cudaStream_t stream) { binaryOp( - out, - in1, - in2, - len, + out, in1, in2, len, [] __device__(InType a, InType b) { if (b == InType(0.0)) return InType(0.0); diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh index d5942b7446..0a4897cc0b 100644 --- a/cpp/include/raft/linalg/gemm.cuh +++ b/cpp/include/raft/linalg/gemm.cuh @@ -43,53 +43,35 @@ namespace linalg { * @param stream cuda stream */ template -void gemm(const raft::handle_t& handle, - const math_t* a, - int n_rows_a, - int n_cols_a, - const math_t* b, - math_t* c, - int n_rows_c, - int n_cols_c, - cublasOperation_t trans_a, - cublasOperation_t trans_b, - math_t alpha, - math_t beta, - cudaStream_t stream) -{ +void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, + int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c, + cublasOperation_t trans_a, cublasOperation_t trans_b, math_t alpha, + math_t beta, cudaStream_t stream) { cublasHandle_t cublas_h = handle.get_cublas_handle(); - int m = n_rows_c; - int n = n_cols_c; - int k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a; + int m = n_rows_c; + int n = n_cols_c; + int k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a; int lda = trans_a == CUBLAS_OP_T ? k : m; int ldb = trans_b == CUBLAS_OP_T ? n : k; int ldc = m; - CUBLAS_CHECK( - cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); + CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, + b, ldb, &beta, c, ldc, stream)); } template -void gemm(const raft::handle_t& handle, - const math_t* a, - int n_rows_a, - int n_cols_a, - const math_t* b, - math_t* c, - int n_rows_c, - int n_cols_c, - cublasOperation_t trans_a, - cublasOperation_t trans_b, - cudaStream_t stream) -{ +void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, + int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c, + cublasOperation_t trans_a, cublasOperation_t trans_b, + cudaStream_t stream) { math_t alpha = math_t(1); - math_t beta = math_t(0); - gemm( - handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); + math_t beta = math_t(0); + gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, + trans_b, alpha, beta, stream); } /** - * @brief A wrapper for CUBLS GEMM function designed for handling all possible + * @brief A wrapper for CUBLS GEMM function designed for handling all possible * combinations of operand layouts. * It computes the following equation: Z = alpha . X * Y + beta . Z * @tparam T Data type of input/output matrices (float/double) @@ -108,20 +90,9 @@ void gemm(const raft::handle_t& handle, * @param beta scalar */ template -void gemm(const raft::handle_t& handle, - T* z, - T* x, - T* y, - int _M, - int _N, - int _K, - bool isZColMajor, - bool isXColMajor, - bool isYColMajor, - cudaStream_t stream, - T alpha = T(1.0), - T beta = T(0.0)) -{ +void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, + int _K, bool isZColMajor, bool isXColMajor, bool isYColMajor, + cudaStream_t stream, T alpha = T(1.0), T beta = T(0.0)) { cublasHandle_t cublas_h = handle.get_cublas_handle(); cublasOperation_t trans_a, trans_b; @@ -148,13 +119,13 @@ void gemm(const raft::handle_t& handle, // therefore trans_x needs to be CUBLAS_OP_T. If x is in column major // layout, trans_b needs to be CUBLAS_OP_N. trans_b = isYColMajor == true ? CUBLAS_OP_N : CUBLAS_OP_T; - ldb = isYColMajor == true ? _K : _N; + ldb = isYColMajor == true ? _K : _N; - c = z; + c = z; ldc = _M; - M = _M; - N = _N; - K = _K; + M = _M; + N = _N; + K = _K; } else { // Result c is required in row major layout Thus we pick // a = y, b = x and c = a * b = y * x @@ -183,7 +154,7 @@ void gemm(const raft::handle_t& handle, // Set leading dimension appropriately ldb = isXColMajor == true ? _M : _K; - c = z; + c = z; ldc = _N; M = _N; @@ -191,8 +162,8 @@ void gemm(const raft::handle_t& handle, K = _K; } // Actual cuBLAS call - CUBLAS_CHECK( - cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); + CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, + b, ldb, &beta, c, ldc, stream)); } } // end namespace linalg diff --git a/cpp/include/raft/linalg/gemv.h b/cpp/include/raft/linalg/gemv.h index a78480bb21..edd18b3bee 100644 --- a/cpp/include/raft/linalg/gemv.h +++ b/cpp/include/raft/linalg/gemv.h @@ -26,19 +26,9 @@ namespace raft { namespace linalg { template -void gemv(const raft::handle_t& handle, - const math_t* a, - int n_rows, - int n_cols, - const math_t* x, - int incx, - math_t* y, - int incy, - bool trans_a, - math_t alpha, - math_t beta, - cudaStream_t stream) -{ +void gemv(const raft::handle_t& handle, const math_t* a, int n_rows, int n_cols, + const math_t* x, int incx, math_t* y, int incy, bool trans_a, + math_t alpha, math_t beta, cudaStream_t stream) { cublasHandle_t cublas_h = handle.get_cublas_handle(); cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; @@ -50,47 +40,33 @@ void gemv(const raft::handle_t& handle, // n - number of columns in input matrix // lda - purpose of it to have ability to operate on submatrices of matrix without copying. // If you're not think about it it's always should be equal to m - // lda has deal with memory layout, but has nothing with the requirement for cuBLAS perform - // transpose + // lda has deal with memory layout, but has nothing with the requirement for cuBLAS perform transpose // In Machine Learning: // m - nunmber of columns in design matrix(number of features) // n - number of rows in designed matrix (number of train examples) - int m = n_rows; - int n = n_cols; + int m = n_rows; + int n = n_cols; int lda = trans_a ? m : n; - CUBLAS_CHECK(cublasgemv(cublas_h, op_a, m, n, &alpha, a, lda, x, incx, &beta, y, incy, stream)); + CUBLAS_CHECK(cublasgemv(cublas_h, op_a, m, n, &alpha, a, lda, x, incx, &beta, + y, incy, stream)); } template -void gemv(const raft::handle_t& handle, - const math_t* a, - int n_rows_a, - int n_cols_a, - const math_t* x, - math_t* y, - bool trans_a, - math_t alpha, - math_t beta, - cudaStream_t stream) -{ +void gemv(const raft::handle_t& handle, const math_t* a, int n_rows_a, + int n_cols_a, const math_t* x, math_t* y, bool trans_a, math_t alpha, + math_t beta, cudaStream_t stream) { gemv(handle, a, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); } template -void gemv(const raft::handle_t& handle, - const math_t* a, - int n_rows_a, - int n_cols_a, - const math_t* x, - math_t* y, - bool trans_a, - cudaStream_t stream) -{ +void gemv(const raft::handle_t& handle, const math_t* a, int n_rows_a, + int n_cols_a, const math_t* x, math_t* y, bool trans_a, + cudaStream_t stream) { math_t alpha = math_t(1); - math_t beta = math_t(0); + math_t beta = math_t(0); gemv(handle, a, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); } diff --git a/cpp/include/raft/linalg/init.h b/cpp/include/raft/linalg/init.h index 2086172f5d..cb2e8ed1ab 100644 --- a/cpp/include/raft/linalg/init.h +++ b/cpp/include/raft/linalg/init.h @@ -36,8 +36,7 @@ namespace { * \param [in] stream cuda stream */ template -void range(T* out, int start, int end, cudaStream_t stream) -{ +void range(T *out, int start, int end, cudaStream_t stream) { thrust::counting_iterator first(start); thrust::counting_iterator last = first + (end - start); thrust::device_ptr ptr(out); @@ -54,8 +53,7 @@ void range(T* out, int start, int end, cudaStream_t stream) * \param [in] stream cuda stream */ template -void range(T* out, int n, cudaStream_t stream) -{ +void range(T *out, int n, cudaStream_t stream) { range(out, 0, n, stream); } } // unnamed namespace diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index 39089473e3..b775a1f696 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -16,7 +16,7 @@ #pragma once -// for cmath: +//for cmath: #define _USE_MATH_DEFINES #include @@ -40,14 +40,14 @@ using namespace linalg; namespace spectral { // curandGeneratorNormalX -inline curandStatus_t curandGenerateNormalX( - curandGenerator_t generator, float* outputPtr, size_t n, float mean, float stddev) -{ +inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, + float *outputPtr, size_t n, + float mean, float stddev) { return curandGenerateNormal(generator, outputPtr, n, mean, stddev); } -inline curandStatus_t curandGenerateNormalX( - curandGenerator_t generator, double* outputPtr, size_t n, double mean, double stddev) -{ +inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, + double *outputPtr, size_t n, + double mean, double stddev) { return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev); } @@ -55,7 +55,7 @@ inline curandStatus_t curandGenerateNormalX( // Helper functions // ========================================================= -/** +/** * @brief Perform Lanczos iteration * Lanczos iteration is performed on a shifted matrix A+shift*I. * @tparam index_type_t the type of data used for indexing. @@ -85,30 +85,25 @@ inline curandStatus_t curandGenerateNormalX( * @return Zero if successful. Otherwise non-zero. */ template -int performLanczosIteration(handle_t const& handle, - sparse_matrix_t const* A, - index_type_t* iter, - index_type_t maxIter, - value_type_t shift, - value_type_t tol, - bool reorthogonalize, - value_type_t* __restrict__ alpha_host, - value_type_t* __restrict__ beta_host, - value_type_t* __restrict__ lanczosVecs_dev, - value_type_t* __restrict__ work_dev) -{ +int performLanczosIteration( + handle_t const &handle, sparse_matrix_t const *A, + index_type_t *iter, index_type_t maxIter, value_type_t shift, + value_type_t tol, bool reorthogonalize, value_type_t *__restrict__ alpha_host, + value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful variables - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; constexpr value_type_t negOne = -1; - constexpr value_type_t zero = 0; + constexpr value_type_t zero = 0; value_type_t alpha; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); RAFT_EXPECTS(A != nullptr, "Null matrix pointer."); @@ -122,28 +117,29 @@ int performLanczosIteration(handle_t const& handle, // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, - lanczosVecs_dev, + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev, n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); + cudaMemcpyDeviceToDevice, stream)); A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); // Orthogonalize Lanczos vector - CUBLAS_CHECK(cublasdot( - cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); + CUBLAS_CHECK(cublasdot(cublas_h, n, lanczosVecs_dev, 1, + lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, + stream)); alpha = -alpha_host[0]; - CUBLAS_CHECK(cublasaxpy( - cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1, + lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, + beta_host, stream)); // Check if Lanczos has converged if (beta_host[0] <= tol) return 0; // Normalize Lanczos vector alpha = 1 / beta_host[0]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), + 1, stream)); } // ------------------------------------------------------- @@ -155,121 +151,65 @@ int performLanczosIteration(handle_t const& handle, // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, - lanczosVecs_dev + (*iter - 1) * n, - n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); - A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); + CUDA_TRY(cudaMemcpyAsync( + lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n, + n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); + A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, + lanczosVecs_dev + IDX(0, *iter, n)); // Full reorthogonalization // "Twice is enough" algorithm per Kahan and Parlett if (reorthogonalize) { - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_T, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1, - stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_N, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); - - CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), - work_dev + (*iter - 1), - sizeof(value_type_t), - cudaMemcpyDeviceToHost, + CUBLAS_CHECK(cublasgemv( + cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, + lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, + lanczosVecs_dev, n, work_dev, 1, &one, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + + CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1), + sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_T, - n, - *iter, - &one, - lanczosVecs_dev, - n, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - &zero, - work_dev, - 1, - stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_N, - n, - *iter, - &negOne, - lanczosVecs_dev, - n, - work_dev, - 1, - &one, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); + CUBLAS_CHECK(cublasgemv( + cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, + lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, + lanczosVecs_dev, n, work_dev, 1, &one, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } // Orthogonalization with 3-term recurrence relation else { - CUBLAS_CHECK(cublasdot(cublas_h, - n, - lanczosVecs_dev + IDX(0, *iter - 1, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - alpha_host + (*iter - 1), - stream)); + CUBLAS_CHECK(cublasdot(cublas_h, n, + lanczosVecs_dev + IDX(0, *iter - 1, n), 1, + lanczosVecs_dev + IDX(0, *iter, n), 1, + alpha_host + (*iter - 1), stream)); auto alpha = -alpha_host[*iter - 1]; - CUBLAS_CHECK(cublasaxpy(cublas_h, - n, - &alpha, - lanczosVecs_dev + IDX(0, *iter - 1, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, + lanczosVecs_dev + IDX(0, *iter - 1, n), 1, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); alpha = -beta_host[*iter - 2]; - CUBLAS_CHECK(cublasaxpy(cublas_h, - n, - &alpha, - lanczosVecs_dev + IDX(0, *iter - 2, n), - 1, - lanczosVecs_dev + IDX(0, *iter, n), - 1, - stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, + lanczosVecs_dev + IDX(0, *iter - 2, n), 1, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } // Compute residual - CUBLAS_CHECK(cublasnrm2( - cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, + beta_host + *iter - 1, stream)); // Check if Lanczos has converged if (beta_host[*iter - 1] <= tol) break; // Normalize Lanczos vector alpha = 1 / beta_host[*iter - 1]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, + lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } CUDA_TRY(cudaStreamSynchronize(stream)); @@ -277,7 +217,7 @@ int performLanczosIteration(handle_t const& handle, return 0; } -/** +/** * @brief Find Householder transform for 3-dimensional system * Given an input vector v=[x,y,z]', this function finds a * Householder transform P such that P*v is a multiple of @@ -295,8 +235,8 @@ int performLanczosIteration(handle_t const& handle, * matrix. Matrix dimensions are 3 x 3. */ template -static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P) -{ +static void findHouseholder3(value_type_t *v, value_type_t *Pv, + value_type_t *P) { // Compute norm of vector *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); @@ -306,7 +246,8 @@ static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P) v[0] -= *Pv; // Normalize Householder vector - value_type_t normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + value_type_t normHouseholder = + std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); if (normHouseholder != 0) { v[0] /= normHouseholder; v[1] /= normHouseholder; @@ -320,13 +261,11 @@ static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P) // Construct Householder matrix index_type_t i, j; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) - P[IDX(i, j, 3)] = -2 * v[i] * v[j]; - for (i = 0; i < 3; ++i) - P[IDX(i, i, 3)] += 1; + for (i = 0; i < 3; ++i) P[IDX(i, j, 3)] = -2 * v[i] * v[j]; + for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1; } -/** +/** * @brief Apply 3-dimensional Householder transform to 4 x 4 matrix * The Householder transform is pre-applied to the top three rows * of the matrix and post-applied to the left three columns. The @@ -338,8 +277,7 @@ static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P) * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. */ template -static void applyHouseholder3(const value_type_t* v, value_type_t* A) -{ +static void applyHouseholder3(const value_type_t *v, value_type_t *A) { // Loop indices index_type_t i, j; // Dot product between Householder vector and matrix row/column @@ -348,23 +286,19 @@ static void applyHouseholder3(const value_type_t* v, value_type_t* A) // Pre-apply Householder transform for (j = 0; j < 4; ++j) { vDotA = 0; - for (i = 0; i < 3; ++i) - vDotA += v[i] * A[IDX(i, j, 4)]; - for (i = 0; i < 3; ++i) - A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; + for (i = 0; i < 3; ++i) vDotA += v[i] * A[IDX(i, j, 4)]; + for (i = 0; i < 3; ++i) A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; } // Post-apply Householder transform for (i = 0; i < 4; ++i) { vDotA = 0; - for (j = 0; j < 3; ++j) - vDotA += A[IDX(i, j, 4)] * v[j]; - for (j = 0; j < 3; ++j) - A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; + for (j = 0; j < 3; ++j) vDotA += A[IDX(i, j, 4)] * v[j]; + for (j = 0; j < 3; ++j) A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; } } -/** +/** * @brief Perform one step of Francis QR algorithm * Equivalent to two steps of the classical QR algorithm on a * tridiagonal matrix. @@ -385,14 +319,10 @@ static void applyHouseholder3(const value_type_t* v, value_type_t* A) * @return Zero if successful. Otherwise non-zero. */ template -static int francisQRIteration(index_type_t n, - value_type_t shift1, - value_type_t shift2, - value_type_t* alpha, - value_type_t* beta, - value_type_t* V, - value_type_t* work) -{ +static int francisQRIteration(index_type_t n, value_type_t shift1, + value_type_t shift2, value_type_t *alpha, + value_type_t *beta, value_type_t *V, + value_type_t *work) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- @@ -422,30 +352,30 @@ static int francisQRIteration(index_type_t n, householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; householder[1] = beta[0] * (alpha[0] + alpha[1] + b); householder[2] = beta[0] * beta[1]; - findHouseholder3(householder, &temp, householderMatrix); + findHouseholder3(householder, &temp, + householderMatrix); // Apply initial Householder transform to create bulge memset(bulge, 0, 16 * sizeof(value_type_t)); - for (i = 0; i < 4; ++i) - bulge[IDX(i, i, 4)] = alpha[i]; + for (i = 0; i < 4; ++i) bulge[IDX(i, i, 4)] = alpha[i]; for (i = 0; i < 3; ++i) { bulge[IDX(i + 1, i, 4)] = beta[i]; bulge[IDX(i, i + 1, 4)] = beta[i]; } applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n); + Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, + 3, 0, work, n); memcpy(V, work, 3 * n * sizeof(value_type_t)); // Chase bulge to bottom-right of matrix with Householder transforms for (pos = 0; pos < n - 4; ++pos) { // Move to next position - alpha[pos] = bulge[IDX(0, 0, 4)]; + alpha[pos] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = bulge[IDX(3, 0, 4)]; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) - bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; bulge[IDX(3, 0, 4)] = 0; bulge[IDX(3, 1, 4)] = 0; bulge[IDX(3, 2, 4)] = beta[pos + 3]; @@ -455,22 +385,22 @@ static int francisQRIteration(index_type_t n, bulge[IDX(3, 3, 4)] = alpha[pos + 4]; // Apply Householder transform - findHouseholder3(householder, beta + pos, householderMatrix); + findHouseholder3(householder, beta + pos, + householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm( - false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n); + Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), + n, householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t)); } // Apply penultimate Householder transform // Values in the last row and column are zero - alpha[n - 4] = bulge[IDX(0, 0, 4)]; + alpha[n - 4] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = bulge[IDX(3, 0, 4)]; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) - bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; bulge[IDX(3, 0, 4)] = 0; bulge[IDX(3, 1, 4)] = 0; bulge[IDX(3, 2, 4)] = 0; @@ -478,36 +408,37 @@ static int francisQRIteration(index_type_t n, bulge[IDX(1, 3, 4)] = 0; bulge[IDX(2, 3, 4)] = 0; bulge[IDX(3, 3, 4)] = 0; - findHouseholder3(householder, beta + n - 4, householderMatrix); + findHouseholder3(householder, beta + n - 4, + householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm( - false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n); + Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, + householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t)); // Apply final Householder transform // Values in the last two rows and columns are zero - alpha[n - 3] = bulge[IDX(0, 0, 4)]; + alpha[n - 3] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = 0; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) - bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - findHouseholder3(householder, beta + n - 3, householderMatrix); + for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + findHouseholder3(householder, beta + n - 3, + householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm( - false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n); + Lapack::gemm(false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, + householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t)); // Bulge has been eliminated alpha[n - 2] = bulge[IDX(0, 0, 4)]; alpha[n - 1] = bulge[IDX(1, 1, 4)]; - beta[n - 2] = bulge[IDX(1, 0, 4)]; + beta[n - 2] = bulge[IDX(1, 0, 4)]; return 0; } -/** +/** * @brief Perform implicit restart of Lanczos algorithm * Shifts are Chebyshev nodes of unwanted region of matrix spectrum. * @tparam index_type_t the type of data used for indexing. @@ -543,30 +474,23 @@ static int francisQRIteration(index_type_t n, * @return error flag. */ template -static int lanczosRestart(handle_t const& handle, - index_type_t n, - index_type_t iter, - index_type_t iter_new, - value_type_t* shiftUpper, - value_type_t* shiftLower, - value_type_t* __restrict__ alpha_host, - value_type_t* __restrict__ beta_host, - value_type_t* __restrict__ V_host, - value_type_t* __restrict__ work_host, - value_type_t* __restrict__ lanczosVecs_dev, - value_type_t* __restrict__ work_dev, - bool smallest_eig) -{ +static int lanczosRestart( + handle_t const &handle, index_type_t n, index_type_t iter, + index_type_t iter_new, value_type_t *shiftUpper, value_type_t *shiftLower, + value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ V_host, value_type_t *__restrict__ work_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev, bool smallest_eig) { // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful constants constexpr value_type_t zero = 0; - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Loop index index_type_t i; @@ -577,12 +501,12 @@ static int lanczosRestart(handle_t const& handle, index_type_t restartSteps = iter - iter_new; // Ritz values from Lanczos method - value_type_t* ritzVals_host = work_host + 3 * iter; + value_type_t *ritzVals_host = work_host + 3 * iter; // Shifts for implicit restart - value_type_t* shifts_host; + value_type_t *shifts_host; // Orthonormal matrix for similarity transform - value_type_t* V_dev = work_dev + n * iter; + value_type_t *V_dev = work_dev + n * iter; // ------------------------------------------------------- // Implementation @@ -600,8 +524,7 @@ static int lanczosRestart(handle_t const& handle, // Initialize similarity transform with identity matrix memset(V_host, 0, iter * iter * sizeof(value_type_t)); - for (i = 0; i < iter; ++i) - V_host[IDX(i, i, iter)] = 1; + for (i = 0; i < iter; ++i) V_host[IDX(i, i, iter)] = 1; // Determine interval to suppress eigenvalues if (smallest_eig) { @@ -625,71 +548,49 @@ static int lanczosRestart(handle_t const& handle, // Calculate Chebyshev nodes as shifts shifts_host = ritzVals_host; for (i = 0; i < restartSteps; ++i) { - shifts_host[i] = cos((i + 0.5) * static_cast(M_PI) / restartSteps); + shifts_host[i] = + cos((i + 0.5) * static_cast(M_PI) / restartSteps); shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower)); shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower)); } // Apply Francis QR algorithm to implicitly restart Lanczos for (i = 0; i < restartSteps; i += 2) - if (francisQRIteration( - iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host)) + if (francisQRIteration(iter, shifts_host[i], shifts_host[i + 1], alpha_host, + beta_host, V_host, work_host)) WARNING("error in implicitly shifted QR algorithm"); // Obtain new residual - CUDA_TRY(cudaMemcpyAsync( - V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); - - beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; - CUBLAS_CHECK(cublasgemv(cublas_h, - CUBLAS_OP_N, - n, - iter, - beta_host + iter_new - 1, - lanczosVecs_dev, - n, - V_dev + IDX(0, iter_new, iter), - 1, - beta_host + iter - 1, - lanczosVecs_dev + IDX(0, iter, n), - 1, - stream)); + CUDA_TRY(cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(value_type_t), + cudaMemcpyHostToDevice, stream)); + + beta_host[iter - 1] = + beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; + CUBLAS_CHECK(cublasgemv( + cublas_h, CUBLAS_OP_N, n, iter, beta_host + iter_new - 1, lanczosVecs_dev, + n, V_dev + IDX(0, iter_new, iter), 1, beta_host + iter - 1, + lanczosVecs_dev + IDX(0, iter, n), 1, stream)); // Obtain new Lanczos vectors - CUBLAS_CHECK(cublasgemm(cublas_h, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - iter_new, - iter, - &one, - lanczosVecs_dev, - n, - V_dev, - iter, - &zero, - work_dev, - n, - stream)); - - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, - work_dev, + CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, iter_new, iter, + &one, lanczosVecs_dev, n, V_dev, iter, &zero, + work_dev, n, stream)); + + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev, n * iter_new * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); + cudaMemcpyDeviceToDevice, stream)); // Normalize residual to obtain new Lanczos vector - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), - lanczosVecs_dev + IDX(0, iter, n), - n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, - stream)); + CUDA_TRY(cudaMemcpyAsync( + lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n), + n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); - CUBLAS_CHECK(cublasnrm2( - cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, + beta_host + iter_new - 1, stream)); auto h_beta = 1 / beta_host[iter_new - 1]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, + lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); return 0; } @@ -700,7 +601,7 @@ static int lanczosRestart(handle_t const& handle, // Eigensolver // ========================================================= -/** +/** * @brief Compute smallest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -750,28 +651,19 @@ static int lanczosRestart(handle_t const& handle, * @return error flag. */ template -int computeSmallestEigenvectors(handle_t const& handle, - sparse_matrix_t const* A, - index_type_t nEigVecs, - index_type_t maxIter, - index_type_t restartIter, - value_type_t tol, - bool reorthogonalize, - index_type_t* effIter, - index_type_t* totalIter, - value_type_t* shift, - value_type_t* __restrict__ alpha_host, - value_type_t* __restrict__ beta_host, - value_type_t* __restrict__ lanczosVecs_dev, - value_type_t* __restrict__ work_dev, - value_type_t* __restrict__ eigVals_dev, - value_type_t* __restrict__ eigVecs_dev, - unsigned long long seed) -{ +int computeSmallestEigenvectors( + handle_t const &handle, sparse_matrix_t const *A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t *effIter, + index_type_t *totalIter, value_type_t *shift, + value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { using namespace spectral; // Useful constants - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; constexpr value_type_t zero = 0; // Matrix dimension @@ -791,20 +683,21 @@ int computeSmallestEigenvectors(handle_t const& handle, index_type_t i; // Host memory - value_type_t* Z_host; // Eigenvectors in Lanczos basis - value_type_t* work_host; // Workspace + value_type_t *Z_host; // Eigenvectors in Lanczos basis + value_type_t *work_host; // Workspace // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // ------------------------------------------------------- // Variable initialization @@ -817,11 +710,12 @@ int computeSmallestEigenvectors(handle_t const& handle, std::vector Z_host_v(restartIter * restartIter); std::vector work_host_v(4 * restartIter); - Z_host = Z_host_v.data(); + Z_host = Z_host_v.data(); work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue to determine shift @@ -844,18 +738,10 @@ int computeSmallestEigenvectors(handle_t const& handle, // Obtain tridiagonal matrix with Lanczos *effIter = 0; - *shift = 0; - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - 0.0, - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); + *shift = 0; + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, 0.0, reorthogonalize, alpha_host, + beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); // Determine largest eigenvalue @@ -870,17 +756,9 @@ int computeSmallestEigenvectors(handle_t const& handle, // Obtain tridiagonal matrix with Lanczos *effIter = 0; - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - 0, - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, + beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter; @@ -897,19 +775,9 @@ int computeSmallestEigenvectors(handle_t const& handle, if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart(handle, - n, - *effIter, - iter_new, - &shiftUpper, - &shiftLower, - alpha_host, - beta_host, - Z_host, - work_host, - lanczosVecs_dev, - work_dev, - true); + status = lanczosRestart( + handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, + beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, true); if (status) WARNING("error in Lanczos implicit restart"); *effIter = iter_new; @@ -918,17 +786,9 @@ int computeSmallestEigenvectors(handle_t const& handle, // Proceed with Lanczos method - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - tol * fabs(shiftLower), - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), + reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter - iter_new; } @@ -939,59 +799,39 @@ int computeSmallestEigenvectors(handle_t const& handle, } // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); - memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); - Lapack::steqr('I', - *effIter, - work_host + 2 * (*effIter), - work_host + 3 * (*effIter), - Z_host, - *effIter, + memcpy(work_host + 2 * (*effIter), alpha_host, + (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, + (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), + work_host + 3 * (*effIter), Z_host, *effIter, work_host); // Obtain desired eigenvalues by applying shift - for (i = 0; i < *effIter; ++i) - work_host[i + 2 * (*effIter)] -= *shift; - for (i = *effIter; i < nEigVecs; ++i) - work_host[i + 2 * (*effIter)] = 0; + for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift; + for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory - CUDA_TRY(cudaMemcpyAsync(eigVals_dev, - work_host + 2 * (*effIter), + CUDA_TRY(cudaMemcpyAsync(eigVals_dev, work_host + 2 * (*effIter), nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, - stream)); + cudaMemcpyHostToDevice, stream)); - CUDA_TRY(cudaMemcpyAsync(work_dev, - Z_host, + CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host, (*effIter) * nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, - stream)); + cudaMemcpyHostToDevice, stream)); CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis - CUBLAS_CHECK(cublasgemm(cublas_h, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - nEigVecs, - *effIter, - &one, - lanczosVecs_dev, - n, - work_dev, - *effIter, - &zero, - eigVecs_dev, - n, - stream)); + CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, + *effIter, &one, lanczosVecs_dev, n, work_dev, + *effIter, &zero, eigVecs_dev, n, stream)); // Clean up and exit curandDestroyGenerator(randGen); return 0; } -/** +/** * @brief Compute smallest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -1029,25 +869,20 @@ int computeSmallestEigenvectors(handle_t const& handle, * @return error flag. */ template -int computeSmallestEigenvectors(handle_t const& handle, - sparse_matrix_t const& A, - index_type_t nEigVecs, - index_type_t maxIter, - index_type_t restartIter, - value_type_t tol, - bool reorthogonalize, - index_type_t& iter, - value_type_t* __restrict__ eigVals_dev, - value_type_t* __restrict__ eigVecs_dev, - unsigned long long seed = 1234567) -{ +int computeSmallestEigenvectors( + handle_t const &handle, sparse_matrix_t const &A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t &iter, + value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) { using namespace spectral; // Matrix dimension index_type_t n = A.nrows_; // Check that parameters are valid - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); @@ -1057,8 +892,8 @@ int computeSmallestEigenvectors(handle_t const& handle, std::vector alpha_host_v(restartIter); std::vector beta_host_v(restartIter); - value_type_t* alpha_host = alpha_host_v.data(); - value_type_t* beta_host = beta_host_v.data(); + value_type_t *alpha_host = alpha_host_v.data(); + value_type_t *beta_host = beta_host_v.data(); vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); vector_t work_dev(handle, (n + restartIter) * restartIter); @@ -1066,23 +901,10 @@ int computeSmallestEigenvectors(handle_t const& handle, // Perform Lanczos method index_type_t effIter; value_type_t shift; - int status = computeSmallestEigenvectors(handle, - &A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - &effIter, - &iter, - &shift, - alpha_host, - beta_host, - lanczosVecs_dev.raw(), - work_dev.raw(), - eigVals_dev, - eigVecs_dev, - seed); + int status = computeSmallestEigenvectors( + handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, + &iter, &shift, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), + eigVals_dev, eigVecs_dev, seed); // Clean up and return return status; @@ -1092,7 +914,7 @@ int computeSmallestEigenvectors(handle_t const& handle, // Eigensolver // ========================================================= -/** +/** * @brief Compute largest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -1137,27 +959,19 @@ int computeSmallestEigenvectors(handle_t const& handle, * @return error flag. */ template -int computeLargestEigenvectors(handle_t const& handle, - sparse_matrix_t const* A, - index_type_t nEigVecs, - index_type_t maxIter, - index_type_t restartIter, - value_type_t tol, - bool reorthogonalize, - index_type_t* effIter, - index_type_t* totalIter, - value_type_t* __restrict__ alpha_host, - value_type_t* __restrict__ beta_host, - value_type_t* __restrict__ lanczosVecs_dev, - value_type_t* __restrict__ work_dev, - value_type_t* __restrict__ eigVals_dev, - value_type_t* __restrict__ eigVecs_dev, - unsigned long long seed) -{ +int computeLargestEigenvectors( + handle_t const &handle, sparse_matrix_t const *A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t *effIter, + index_type_t *totalIter, value_type_t *__restrict__ alpha_host, + value_type_t *__restrict__ beta_host, + value_type_t *__restrict__ lanczosVecs_dev, + value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { using namespace spectral; // Useful constants - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; constexpr value_type_t zero = 0; // Matrix dimension @@ -1173,8 +987,8 @@ int computeLargestEigenvectors(handle_t const& handle, index_type_t i; // Host memory - value_type_t* Z_host; // Eigenvectors in Lanczos basis - value_type_t* work_host; // Workspace + value_type_t *Z_host; // Eigenvectors in Lanczos basis + value_type_t *work_host; // Workspace // ------------------------------------------------------- // Check that LAPACK is enabled @@ -1184,14 +998,15 @@ int computeLargestEigenvectors(handle_t const& handle, // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // ------------------------------------------------------- // Variable initialization @@ -1204,11 +1019,12 @@ int computeLargestEigenvectors(handle_t const& handle, std::vector Z_host_v(restartIter * restartIter); std::vector work_host_v(4 * restartIter); - Z_host = Z_host_v.data(); + Z_host = Z_host_v.data(); work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue @@ -1228,21 +1044,13 @@ int computeLargestEigenvectors(handle_t const& handle, CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); // Obtain tridiagonal matrix with Lanczos - *effIter = 0; + *effIter = 0; value_type_t shift_val = 0.0; - value_type_t* shift = &shift_val; - - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - 0, - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); + value_type_t *shift = &shift_val; + + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, + beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter; @@ -1259,19 +1067,9 @@ int computeLargestEigenvectors(handle_t const& handle, if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart(handle, - n, - *effIter, - iter_new, - &shiftUpper, - &shiftLower, - alpha_host, - beta_host, - Z_host, - work_host, - lanczosVecs_dev, - work_dev, - false); + status = lanczosRestart( + handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, + beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, false); if (status) WARNING("error in Lanczos implicit restart"); *effIter = iter_new; @@ -1280,17 +1078,9 @@ int computeLargestEigenvectors(handle_t const& handle, // Proceed with Lanczos method - status = performLanczosIteration(handle, - A, - effIter, - maxIter_curr, - *shift, - tol * fabs(shiftLower), - reorthogonalize, - alpha_host, - beta_host, - lanczosVecs_dev, - work_dev); + status = performLanczosIteration( + handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), + reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter - iter_new; } @@ -1300,18 +1090,15 @@ int computeLargestEigenvectors(handle_t const& handle, WARNING("implicitly restarted Lanczos failed to converge"); } for (int i = 0; i < restartIter; ++i) { - for (int j = 0; j < restartIter; ++j) - Z_host[i * restartIter + j] = 0; + for (int j = 0; j < restartIter; ++j) Z_host[i * restartIter + j] = 0; } // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); - memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); - Lapack::steqr('I', - *effIter, - work_host + 2 * (*effIter), - work_host + 3 * (*effIter), - Z_host, - *effIter, + memcpy(work_host + 2 * (*effIter), alpha_host, + (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, + (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), + work_host + 3 * (*effIter), Z_host, *effIter, work_host); // note: We need to pick the top nEigVecs eigenvalues @@ -1336,52 +1123,36 @@ int computeLargestEigenvectors(handle_t const& handle, //} // Obtain desired eigenvalues by applying shift - for (i = 0; i < *effIter; ++i) - work_host[i + 2 * (*effIter)] -= *shift; + for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift; for (i = 0; i < top_eigenparis_idx_offset; ++i) work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory // skip smallest eigenvalue if needed - CUDA_TRY(cudaMemcpyAsync(eigVals_dev, - work_host + 2 * (*effIter) + top_eigenparis_idx_offset, - nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, - stream)); + CUDA_TRY(cudaMemcpyAsync( + eigVals_dev, work_host + 2 * (*effIter) + top_eigenparis_idx_offset, + nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); // skip smallest eigenvector if needed CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host + (top_eigenparis_idx_offset * (*effIter)), (*effIter) * nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, - stream)); + cudaMemcpyHostToDevice, stream)); CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis - CUBLAS_CHECK(cublasgemm(cublas_h, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - nEigVecs, - *effIter, - &one, - lanczosVecs_dev, - n, - work_dev, - *effIter, - &zero, - eigVecs_dev, - n, - stream)); + CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, + *effIter, &one, lanczosVecs_dev, n, work_dev, + *effIter, &zero, eigVecs_dev, n, stream)); // Clean up and exit curandDestroyGenerator(randGen); return 0; } -/** +/** * @brief Compute largest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -1419,23 +1190,18 @@ int computeLargestEigenvectors(handle_t const& handle, * @return error flag. */ template -int computeLargestEigenvectors(handle_t const& handle, - sparse_matrix_t const& A, - index_type_t nEigVecs, - index_type_t maxIter, - index_type_t restartIter, - value_type_t tol, - bool reorthogonalize, - index_type_t& iter, - value_type_t* __restrict__ eigVals_dev, - value_type_t* __restrict__ eigVecs_dev, - unsigned long long seed = 123456) -{ +int computeLargestEigenvectors( + handle_t const &handle, sparse_matrix_t const &A, + index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, + value_type_t tol, bool reorthogonalize, index_type_t &iter, + value_type_t *__restrict__ eigVals_dev, + value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 123456) { // Matrix dimension index_type_t n = A.nrows_; // Check that parameters are valid - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, + "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); @@ -1445,30 +1211,18 @@ int computeLargestEigenvectors(handle_t const& handle, std::vector alpha_host_v(restartIter); std::vector beta_host_v(restartIter); - value_type_t* alpha_host = alpha_host_v.data(); - value_type_t* beta_host = beta_host_v.data(); + value_type_t *alpha_host = alpha_host_v.data(); + value_type_t *beta_host = beta_host_v.data(); vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); vector_t work_dev(handle, (n + restartIter) * restartIter); // Perform Lanczos method index_type_t effIter; - int status = computeLargestEigenvectors(handle, - &A, - nEigVecs, - maxIter, - restartIter, - tol, - reorthogonalize, - &effIter, - &iter, - alpha_host, - beta_host, - lanczosVecs_dev.raw(), - work_dev.raw(), - eigVals_dev, - eigVecs_dev, - seed); + int status = computeLargestEigenvectors( + handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, + &iter, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), + eigVals_dev, eigVecs_dev, seed); // Clean up and return return status; diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh index 200818fdc3..aff08da2d3 100644 --- a/cpp/include/raft/linalg/map.cuh +++ b/cpp/include/raft/linalg/map.cuh @@ -24,18 +24,21 @@ namespace raft { namespace linalg { -template -__global__ void mapKernel(OutType* out, size_t len, MapOp map, const InType* in, Args... args) -{ +template +__global__ void mapKernel(OutType *out, size_t len, MapOp map, const InType *in, + Args... args) { auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); - if (idx < len) { out[idx] = map(in[idx], args[idx]...); } + if (idx < len) { + out[idx] = map(in[idx], args[idx]...); + } } -template -void mapImpl( - OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) -{ +template +void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream, + const InType *in, Args... args) { const int nblks = raft::ceildiv(len, (size_t)TPB); mapKernel <<>>(out, len, map, in, args...); @@ -57,14 +60,12 @@ void mapImpl( * @param args additional input arrays */ -template -void map(OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) -{ - mapImpl(out, len, map, stream, in, args...); +void map(OutType *out, size_t len, MapOp map, cudaStream_t stream, + const InType *in, Args... args) { + mapImpl(out, len, map, stream, in, + args...); } } // namespace linalg diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/map_then_reduce.cuh index 78a7017c5c..f2f198670a 100644 --- a/cpp/include/raft/linalg/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/map_then_reduce.cuh @@ -24,66 +24,50 @@ namespace raft { namespace linalg { -struct sum_tag { -}; +struct sum_tag {}; template -__device__ void reduce(OutType* out, const InType acc, sum_tag) -{ +__device__ void reduce(OutType *out, const InType acc, sum_tag) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; OutType tmp = BlockReduce(temp_storage).Sum(acc); - if (threadIdx.x == 0) { raft::myAtomicAdd(out, tmp); } + if (threadIdx.x == 0) { + raft::myAtomicAdd(out, tmp); + } } template -__device__ void reduce(OutType* out, const InType acc, ReduceLambda op) -{ +__device__ void reduce(OutType *out, const InType acc, ReduceLambda op) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; OutType tmp = BlockReduce(temp_storage).Reduce(acc, op); - if (threadIdx.x == 0) { raft::myAtomicReduce(out, tmp, op); } + if (threadIdx.x == 0) { + raft::myAtomicReduce(out, tmp, op); + } } -template -__global__ void mapThenReduceKernel(OutType* out, - size_t len, - OutType neutral, - MapOp map, - ReduceLambda op, - const InType* in, - Args... args) -{ +template +__global__ void mapThenReduceKernel(OutType *out, size_t len, OutType neutral, + MapOp map, ReduceLambda op, + const InType *in, Args... args) { OutType acc = neutral; - auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); + auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); - if (idx < len) { acc = map(in[idx], args[idx]...); } + if (idx < len) { + acc = map(in[idx], args[idx]...); + } __syncthreads(); reduce(out, acc, op); } -template -void mapThenReduceImpl(OutType* out, - size_t len, - OutType neutral, - MapOp map, - ReduceLambda op, - cudaStream_t stream, - const InType* in, - Args... args) -{ +template +void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map, + ReduceLambda op, cudaStream_t stream, const InType *in, + Args... args) { raft::update_device(out, &neutral, 1, stream); const int nblks = raft::ceildiv(len, (size_t)TPB); mapThenReduceKernel @@ -105,14 +89,10 @@ void mapThenReduceImpl(OutType* out, * @param args additional input arrays */ -template -void mapThenSumReduce( - OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) -{ +void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream, + const InType *in, Args... args) { mapThenReduceImpl( out, len, (OutType)0, map, sum_tag(), stream, in, args...); } @@ -135,21 +115,11 @@ void mapThenSumReduce( * @param args additional input arrays */ -template -void mapThenReduce(OutType* out, - size_t len, - OutType neutral, - MapOp map, - ReduceLambda op, - cudaStream_t stream, - const InType* in, - Args... args) -{ +template +void mapThenReduce(OutType *out, size_t len, OutType neutral, MapOp map, + ReduceLambda op, cudaStream_t stream, const InType *in, + Args... args) { mapThenReduceImpl( out, len, neutral, map, op, stream, in, args...); } diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 98b5eaa809..902816418f 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -23,15 +23,10 @@ namespace raft { namespace linalg { template -__global__ void matrixVectorOpKernel(Type* out, - const Type* matrix, - const Type* vector, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op) -{ +__global__ void matrixVectorOpKernel(Type *out, const Type *matrix, + const Type *vector, IdxType D, IdxType N, + bool rowMajor, bool bcastAlongRows, + Lambda op) { typedef TxN_t VecType; IdxType len = N * D; IdxType idx = threadIdx.x; @@ -62,21 +57,17 @@ __global__ void matrixVectorOpKernel(Type* out, mat.store(out, idx); } -template -void matrixVectorOpImpl(Type* out, - const Type* matrix, - const Type* vec, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op, - cudaStream_t stream) -{ - IdxType len = N * D; - IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB); +template +void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec, + IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, Lambda op, cudaStream_t stream) { + IdxType len = N * D; + IdxType nblks = + raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB); matrixVectorOpKernel - <<>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op); + <<>>(out, matrix, vec, D, N, rowMajor, + bcastAlongRows, op); CUDA_CHECK(cudaPeekAtLastError()); } @@ -98,18 +89,11 @@ void matrixVectorOpImpl(Type* out, * @param stream cuda stream where to launch work */ template -void matrixVectorOp(Type* out, - const Type* matrix, - const Type* vec, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op, - cudaStream_t stream) -{ +void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, + IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op, + cudaStream_t stream) { IdxType stride = rowMajor ? D : N; - size_t bytes = stride * sizeof(Type); + size_t bytes = stride * sizeof(Type); if (16 / sizeof(Type) && bytes % 16 == 0) { matrixVectorOpImpl( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); @@ -134,16 +118,10 @@ void matrixVectorOp(Type* out, ///@todo: come up with a cleaner interface to support these cases in future! template -__global__ void matrixVectorOpKernel(Type* out, - const Type* matrix, - const Type* vector1, - const Type* vector2, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op) -{ +__global__ void matrixVectorOpKernel(Type *out, const Type *matrix, + const Type *vector1, const Type *vector2, + IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, Lambda op) { typedef TxN_t VecType; IdxType len = N * D; IdxType idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio; @@ -176,21 +154,15 @@ __global__ void matrixVectorOpKernel(Type* out, mat.store(out, idx); } -template -void matrixVectorOpImpl(Type* out, - const Type* matrix, - const Type* vec1, - const Type* vec2, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op, - cudaStream_t stream) -{ +template +void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1, + const Type *vec2, IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, Lambda op, cudaStream_t stream) { IdxType nblks = raft::ceildiv(N * D, (IdxType)TPB); matrixVectorOpKernel - <<>>(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op); + <<>>(out, matrix, vec1, vec2, D, N, rowMajor, + bcastAlongRows, op); CUDA_CHECK(cudaPeekAtLastError()); } @@ -213,19 +185,11 @@ void matrixVectorOpImpl(Type* out, * @param stream cuda stream where to launch work */ template -void matrixVectorOp(Type* out, - const Type* matrix, - const Type* vec1, - const Type* vec2, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op, - cudaStream_t stream) -{ +void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1, + const Type *vec2, IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, Lambda op, cudaStream_t stream) { IdxType stride = rowMajor ? D : N; - size_t bytes = stride * sizeof(Type); + size_t bytes = stride * sizeof(Type); if (16 / sizeof(Type) && bytes % 16 == 0) { matrixVectorOpImpl( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh index a3fcc5bac6..9d1538c172 100644 --- a/cpp/include/raft/linalg/mean_squared_error.cuh +++ b/cpp/include/raft/linalg/mean_squared_error.cuh @@ -24,7 +24,7 @@ namespace linalg { /** * @brief CUDA version mean squared error function mean((A-B)**2) * @tparam math_t data-type upon which the math operation will be performed - * @tparam TPB threads-per-block + * @tparam TPB threads-per-block * @param out the output mean squared error value (assumed to be a device pointer) * @param A input array (assumed to be a device pointer) * @param B input array (assumed to be a device pointer) @@ -33,14 +33,14 @@ namespace linalg { * @param stream cuda-stream where to launch this kernel */ template -void meanSquaredError( - math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream) -{ +void meanSquaredError(math_t* out, const math_t* A, const math_t* B, size_t len, + math_t weight, cudaStream_t stream) { auto sq_diff = [len, weight] __device__(const math_t a, const math_t b) { math_t diff = a - b; return diff * diff * weight / len; }; - mapThenSumReduce(out, len, sq_diff, stream, A, B); + mapThenSumReduce(out, len, sq_diff, stream, A, + B); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh index 53d57ecd00..ce948c927d 100644 --- a/cpp/include/raft/linalg/multiply.cuh +++ b/cpp/include/raft/linalg/multiply.cuh @@ -33,10 +33,11 @@ namespace linalg { * @{ */ template -void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) -{ +void multiplyScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, + cudaStream_t stream) { unaryOp( - out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, stream); + out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, + stream); } /** @} */ diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh index 82558c8023..64930a7123 100644 --- a/cpp/include/raft/linalg/norm.cuh +++ b/cpp/include/raft/linalg/norm.cuh @@ -44,46 +44,22 @@ enum NormType { L1Norm = 0, L2Norm }; * @param stream cuda stream where to launch work * @param fin_op the final lambda op */ -template > -void rowNorm(Type* dots, - const Type* data, - IdxType D, - IdxType N, - NormType type, - bool rowMajor, - cudaStream_t stream, - Lambda fin_op = raft::Nop()) -{ +template > +void rowNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type, + bool rowMajor, cudaStream_t stream, + Lambda fin_op = raft::Nop()) { switch (type) { case L1Norm: - reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - true, - stream, - false, - raft::L1Op(), - raft::Sum(), - fin_op); + reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false, + raft::L1Op(), raft::Sum(), fin_op); break; case L2Norm: - reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - true, - stream, - false, - raft::L2Op(), - raft::Sum(), - fin_op); + reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false, + raft::L2Op(), raft::Sum(), fin_op); break; - default: ASSERT(false, "Invalid norm type passed! [%d]", type); + default: + ASSERT(false, "Invalid norm type passed! [%d]", type); }; } @@ -101,46 +77,22 @@ void rowNorm(Type* dots, * @param stream cuda stream where to launch work * @param fin_op the final lambda op */ -template > -void colNorm(Type* dots, - const Type* data, - IdxType D, - IdxType N, - NormType type, - bool rowMajor, - cudaStream_t stream, - Lambda fin_op = raft::Nop()) -{ +template > +void colNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type, + bool rowMajor, cudaStream_t stream, + Lambda fin_op = raft::Nop()) { switch (type) { case L1Norm: - reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - false, - stream, - false, - raft::L1Op(), - raft::Sum(), - fin_op); + reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false, + raft::L1Op(), raft::Sum(), fin_op); break; case L2Norm: - reduce(dots, - data, - D, - N, - (Type)0, - rowMajor, - false, - stream, - false, - raft::L2Op(), - raft::Sum(), - fin_op); + reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false, + raft::L2Op(), raft::Sum(), fin_op); break; - default: ASSERT(false, "Invalid norm type passed! [%d]", type); + default: + ASSERT(false, "Invalid norm type passed! [%d]", type); }; } diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh index c2455ac3a8..cafa8d54f1 100644 --- a/cpp/include/raft/linalg/qr.cuh +++ b/cpp/include/raft/linalg/qr.cuh @@ -40,19 +40,15 @@ namespace linalg { * @{ */ template -void qrGetQ(const raft::handle_t& handle, - const math_t* M, - math_t* Q, - int n_rows, - int n_cols, - cudaStream_t stream) -{ - auto allocator = handle.get_device_allocator(); +void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, + int n_rows, int n_cols, cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int m = n_rows, n = n_cols; int k = min(m, n); - CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, + cudaMemcpyDeviceToDevice, stream)); raft::mr::device::buffer tau(allocator, stream, k); CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream)); @@ -62,16 +58,19 @@ void qrGetQ(const raft::handle_t& handle, CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork)); raft::mr::device::buffer workspace(allocator, stream, Lwork); - CUSOLVER_CHECK(cusolverDngeqrf( - cusolverH, m, n, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); + CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(), + workspace.data(), Lwork, devInfo.data(), + stream)); /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 CUDA_CHECK(cudaDeviceSynchronize()); #endif - CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); + CUSOLVER_CHECK( + cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); workspace.resize(Lwork, stream); - CUSOLVER_CHECK(cusolverDnorgqr( - cusolverH, m, n, k, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); + CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, m, n, k, Q, m, tau.data(), + workspace.data(), Lwork, devInfo.data(), + stream)); } /** @@ -85,41 +84,30 @@ void qrGetQ(const raft::handle_t& handle, * @param stream cuda stream */ template -void qrGetQR(const raft::handle_t& handle, - math_t* M, - math_t* Q, - math_t* R, - int n_rows, - int n_cols, - cudaStream_t stream) -{ - auto allocator = handle.get_device_allocator(); +void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, + int n_rows, int n_cols, cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int m = n_rows, n = n_cols; raft::mr::device::buffer R_full(allocator, stream, m * n); raft::mr::device::buffer tau(allocator, stream, min(m, n)); - CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); - int R_full_nrows = m, R_full_ncols = n; CUDA_CHECK( - cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); + cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); + int R_full_nrows = m, R_full_ncols = n; + CUDA_CHECK(cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, + cudaMemcpyDeviceToDevice, stream)); int Lwork; raft::mr::device::buffer devInfo(allocator, stream, 1); - CUSOLVER_CHECK(cusolverDngeqrf_bufferSize( - cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, &Lwork)); + CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows, + R_full_ncols, R_full.data(), + R_full_nrows, &Lwork)); raft::mr::device::buffer workspace(allocator, stream, Lwork); - CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, - R_full_nrows, - R_full_ncols, - R_full.data(), - R_full_nrows, - tau.data(), - workspace.data(), - Lwork, - devInfo.data(), - stream)); + CUSOLVER_CHECK(cusolverDngeqrf( + cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, + tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 CUDA_CHECK(cudaDeviceSynchronize()); @@ -127,24 +115,17 @@ void qrGetQR(const raft::handle_t& handle, raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream); - CUDA_CHECK( - cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, + cudaMemcpyDeviceToDevice, stream)); int Q_nrows = m, Q_ncols = n; - CUSOLVER_CHECK(cusolverDnorgqr_bufferSize( - cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), &Lwork)); + CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, Q_nrows, Q_ncols, + min(Q_ncols, Q_nrows), Q, Q_nrows, + tau.data(), &Lwork)); workspace.resize(Lwork, stream); - CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, - Q_nrows, - Q_ncols, - min(Q_ncols, Q_nrows), - Q, - Q_nrows, - tau.data(), - workspace.data(), - Lwork, - devInfo.data(), - stream)); + CUSOLVER_CHECK(cusolverDnorgqr( + cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), + workspace.data(), Lwork, devInfo.data(), stream)); } /** @} */ diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh index 693a797db9..d39577bbdd 100644 --- a/cpp/include/raft/linalg/reduce.cuh +++ b/cpp/include/raft/linalg/reduce.cuh @@ -52,33 +52,28 @@ namespace linalg { * @param reduce_op binary reduction operation * @param final_op elementwise operation to apply before storing results */ -template , +template , typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void reduce(OutType* dots, - const InType* data, - int D, - int N, - OutType init, - bool rowMajor, - bool alongRows, - cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::Nop(), + typename FinalLambda = raft::Nop> +void reduce(OutType *dots, const InType *data, int D, int N, OutType init, + bool rowMajor, bool alongRows, cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) -{ + FinalLambda final_op = raft::Nop()) { if (rowMajor && alongRows) { - coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, + reduce_op, final_op); } else if (rowMajor && !alongRows) { - stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + stridedReduction(dots, data, D, N, init, stream, inplace, main_op, + reduce_op, final_op); } else if (!rowMajor && alongRows) { - stridedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); + stridedReduction(dots, data, N, D, init, stream, inplace, main_op, + reduce_op, final_op); } else { - coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); + coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, + reduce_op, final_op); } } diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh index f931c976fd..bba652e137 100644 --- a/cpp/include/raft/linalg/strided_reduction.cuh +++ b/cpp/include/raft/linalg/strided_reduction.cuh @@ -28,15 +28,14 @@ namespace linalg { // of the matrix, i.e. reduce along columns for row major or reduce along rows // for column major layout template -__global__ void stridedSummationKernel( - Type* dots, const Type* data, int D, int N, Type init, MainLambda main_op) -{ +__global__ void stridedSummationKernel(Type *dots, const Type *data, int D, + int N, Type init, MainLambda main_op) { // Thread reduction Type thread_data = Type(init); - int colStart = blockIdx.x * blockDim.x + threadIdx.x; + int colStart = blockIdx.x * blockDim.x + threadIdx.x; if (colStart < D) { int rowStart = blockIdx.y * blockDim.y + threadIdx.y; - int stride = blockDim.y * gridDim.y; + int stride = blockDim.y * gridDim.y; for (int j = rowStart; j < N; j += stride) { int idx = colStart + j * D; thread_data += main_op(data[idx], j); @@ -45,8 +44,8 @@ __global__ void stridedSummationKernel( // Block reduction extern __shared__ char tmp[]; // One element per thread in block - Type* temp = (Type*)tmp; // Cast to desired type - int myidx = threadIdx.x + blockDim.x * threadIdx.y; + Type *temp = (Type *)tmp; // Cast to desired type + int myidx = threadIdx.x + blockDim.x * threadIdx.y; temp[myidx] = thread_data; __syncthreads(); for (int j = blockDim.y / 2; j > 0; j /= 2) { @@ -55,31 +54,24 @@ __global__ void stridedSummationKernel( } // Grid reduction - if ((colStart < D) && (threadIdx.y == 0)) raft::myAtomicAdd(dots + colStart, temp[myidx]); + if ((colStart < D) && (threadIdx.y == 0)) + raft::myAtomicAdd(dots + colStart, temp[myidx]); } // Kernel to perform reductions along the strided dimension // of the matrix, i.e. reduce along columns for row major or reduce along rows // for column major layout -template -__global__ void stridedReductionKernel(OutType* dots, - const InType* data, - int D, - int N, - OutType init, - MainLambda main_op, - ReduceLambda reduce_op) -{ +template +__global__ void stridedReductionKernel(OutType *dots, const InType *data, int D, + int N, OutType init, MainLambda main_op, + ReduceLambda reduce_op) { // Thread reduction OutType thread_data = init; - IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x; + IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x; if (colStart < D) { IdxType rowStart = blockIdx.y * blockDim.y + threadIdx.y; - IdxType stride = blockDim.y * gridDim.y; + IdxType stride = blockDim.y * gridDim.y; for (IdxType j = rowStart; j < N; j += stride) { IdxType idx = colStart + j * D; thread_data = reduce_op(thread_data, main_op(data[idx], j)); @@ -87,13 +79,14 @@ __global__ void stridedReductionKernel(OutType* dots, } // Block reduction - extern __shared__ char tmp[]; // One element per thread in block - auto* temp = (OutType*)tmp; // Cast to desired type + extern __shared__ char tmp[]; // One element per thread in block + auto *temp = (OutType *)tmp; // Cast to desired type IdxType myidx = threadIdx.x + ((IdxType)blockDim.x * (IdxType)threadIdx.y); - temp[myidx] = thread_data; + temp[myidx] = thread_data; __syncthreads(); for (int j = blockDim.y / 2; j > 0; j /= 2) { - if (threadIdx.y < j) temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]); + if (threadIdx.y < j) + temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]); __syncthreads(); } @@ -129,23 +122,15 @@ __global__ void stridedReductionKernel(OutType* dots, * @param inplace reduction result added inplace or overwrites old values? * @param stream cuda stream where to launch work */ -template , +template , typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void stridedReduction(OutType* dots, - const InType* data, - IdxType D, - IdxType N, - OutType init, - cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::Nop(), + typename FinalLambda = raft::Nop> +void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, + OutType init, cudaStream_t stream, bool inplace = false, + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) -{ + FinalLambda final_op = raft::Nop()) { ///@todo: this extra should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) if (!inplace) @@ -155,7 +140,7 @@ void stridedReduction(OutType* dots, // Arbitrary numbers for now, probably need to tune const dim3 thrds(32, 16); IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y); - elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; + elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x), raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread)); const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y; @@ -168,7 +153,8 @@ void stridedReduction(OutType* dots, <<>>(dots, data, D, N, init, main_op); else stridedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op); + <<>>(dots, data, D, N, init, main_op, + reduce_op); ///@todo: this complication should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh index 43060d0818..882c105689 100644 --- a/cpp/include/raft/linalg/subtract.cuh +++ b/cpp/include/raft/linalg/subtract.cuh @@ -38,8 +38,8 @@ namespace linalg { * @param stream cuda stream where to launch work */ template -void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) -{ +void subtractScalar(OutT *out, const InT *in, InT scalar, IdxType len, + cudaStream_t stream) { auto op = [scalar] __device__(InT in) { return OutT(in - scalar); }; unaryOp(out, in, len, op, stream); } @@ -58,25 +58,24 @@ void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStrea * @param stream cuda stream where to launch work */ template -void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) -{ +void subtract(OutT *out, const InT *in1, const InT *in2, IdxType len, + cudaStream_t stream) { auto op = [] __device__(InT a, InT b) { return OutT(a - b); }; binaryOp(out, in1, in2, len, op, stream); } template -__global__ void subtract_dev_scalar_kernel(math_t* outDev, - const math_t* inDev, - const math_t* singleScalarDev, - IdxType len) -{ - // TODO: kernel do not use shared memory in current implementation +__global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev, + const math_t *singleScalarDev, + IdxType len) { + //TODO: kernel do not use shared memory in current implementation int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { outDev[i] = inDev[i] - *singleScalarDev; } + if (i < len) { + outDev[i] = inDev[i] - *singleScalarDev; + } } -/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and - * write result to outDev[i] +/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i] * @tparam math_t data-type upon which the math operation will be performed * @tparam IdxType Integer type used to for addressing * @param outDev the output buffer @@ -87,12 +86,9 @@ __global__ void subtract_dev_scalar_kernel(math_t* outDev, * @remark block size has not been tuned */ template -void subtractDevScalar(math_t* outDev, - const math_t* inDev, - const math_t* singleScalarDev, - IdxType len, - cudaStream_t stream) -{ +void subtractDevScalar(math_t *outDev, const math_t *inDev, + const math_t *singleScalarDev, IdxType len, + cudaStream_t stream) { // Just for the note - there is no way to express such operation with cuBLAS in effective way // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda const IdxType nblks = raft::ceildiv(len, (IdxType)TPB); diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh index 1cb8b7592f..7357a68a4c 100644 --- a/cpp/include/raft/linalg/svd.cuh +++ b/cpp/include/raft/linalg/svd.cuh @@ -50,21 +50,14 @@ namespace linalg { // TODO: couldn't template this function due to cusolverDnSgesvd and // cusolverSnSgesvd. Check if there is any other way. template -void svdQR(const raft::handle_t& handle, - T* in, - int n_rows, - int n_cols, - T* sing_vals, - T* left_sing_vecs, - T* right_sing_vecs, - bool trans_right, - bool gen_left_vec, - bool gen_right_vec, - cudaStream_t stream) -{ - std::shared_ptr allocator = handle.get_device_allocator(); - cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - cublasHandle_t cublasH = handle.get_cublas_handle(); +void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, + T *sing_vals, T *left_sing_vecs, T *right_sing_vecs, + bool trans_right, bool gen_left_vec, bool gen_right_vec, + cudaStream_t stream) { + std::shared_ptr allocator = + handle.get_device_allocator(); + cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); + cublasHandle_t cublasH = handle.get_cublas_handle(); #if CUDART_VERSION >= 10010 && CUDART_VERSION < 11000 // 46340: sqrt of max int value @@ -79,13 +72,14 @@ void svdQR(const raft::handle_t& handle, const int n = n_cols; raft::mr::device::buffer devInfo(allocator, stream, 1); - T* d_rwork = nullptr; + T *d_rwork = nullptr; int lwork = 0; - CUSOLVER_CHECK(cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); + CUSOLVER_CHECK( + cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); raft::mr::device::buffer d_work(allocator, stream, lwork); - char jobu = 'S'; + char jobu = 'S'; char jobvt = 'A'; if (!gen_left_vec) { @@ -98,23 +92,9 @@ void svdQR(const raft::handle_t& handle, strcpy(&jobvt, &new_vt); } - CUSOLVER_CHECK(cusolverDngesvd(cusolverH, - jobu, - jobvt, - m, - n, - in, - m, - sing_vals, - left_sing_vecs, - m, - right_sing_vecs, - n, - d_work.data(), - lwork, - d_rwork, - devInfo.data(), - stream)); + CUSOLVER_CHECK(cusolverDngesvd( + cusolverH, jobu, jobvt, m, n, in, m, sing_vals, left_sing_vecs, m, + right_sing_vecs, n, d_work.data(), lwork, d_rwork, devInfo.data(), stream)); // Transpose the right singular vector back if (trans_right) raft::linalg::transpose(right_sing_vecs, n_cols, stream); @@ -130,37 +110,19 @@ void svdQR(const raft::handle_t& handle, } template -void svdEig(const raft::handle_t& handle, - T* in, - int n_rows, - int n_cols, - T* S, - T* U, - T* V, - bool gen_left_vec, - cudaStream_t stream) -{ - auto allocator = handle.get_device_allocator(); +void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, + T *U, T *V, bool gen_left_vec, cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - cublasHandle_t cublasH = handle.get_cublas_handle(); + cublasHandle_t cublasH = handle.get_cublas_handle(); int len = n_cols * n_cols; raft::mr::device::buffer in_cross_mult(allocator, stream, len); T alpha = T(1); - T beta = T(0); - raft::linalg::gemm(handle, - in, - n_rows, - n_cols, - in, - in_cross_mult.data(), - n_cols, - n_cols, - CUBLAS_OP_T, - CUBLAS_OP_N, - alpha, - beta, + T beta = T(0); + raft::linalg::gemm(handle, in, n_rows, n_cols, in, in_cross_mult.data(), + n_cols, n_cols, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream); eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream); @@ -171,20 +133,10 @@ void svdEig(const raft::handle_t& handle, raft::matrix::seqRoot(S, S, alpha, n_cols, stream, true); if (gen_left_vec) { - raft::linalg::gemm(handle, - in, - n_rows, - n_cols, - V, - U, - n_rows, - n_cols, - CUBLAS_OP_N, - CUBLAS_OP_N, - alpha, - beta, - stream); - raft::matrix::matrixVectorBinaryDivSkipZero(U, S, n_rows, n_cols, false, true, stream); + raft::linalg::gemm(handle, in, n_rows, n_cols, V, U, n_rows, n_cols, + CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); + raft::matrix::matrixVectorBinaryDivSkipZero(U, S, n_rows, n_cols, false, + true, stream); } } @@ -206,20 +158,11 @@ void svdEig(const raft::handle_t& handle, * @param stream cuda stream */ template -void svdJacobi(const raft::handle_t& handle, - math_t* in, - int n_rows, - int n_cols, - math_t* sing_vals, - math_t* left_sing_vecs, - math_t* right_sing_vecs, - bool gen_left_vec, - bool gen_right_vec, - math_t tol, - int max_sweeps, - cudaStream_t stream) -{ - auto allocator = handle.get_device_allocator(); +void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, + math_t *sing_vals, math_t *left_sing_vecs, + math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec, + math_t tol, int max_sweeps, cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); gesvdjInfo_t gesvdj_params = NULL; @@ -234,42 +177,18 @@ void svdJacobi(const raft::handle_t& handle, raft::mr::device::buffer devInfo(allocator, stream, 1); int lwork = 0; - int econ = 1; - - CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - econ, - m, - n, - in, - m, - sing_vals, - left_sing_vecs, - m, - right_sing_vecs, - n, - &lwork, - gesvdj_params)); + int econ = 1; + + CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, + left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params)); raft::mr::device::buffer d_work(allocator, stream, lwork); - CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj(cusolverH, - CUSOLVER_EIG_MODE_VECTOR, - econ, - m, - n, - in, - m, - sing_vals, - left_sing_vecs, - m, - right_sing_vecs, - n, - d_work.data(), - lwork, - devInfo.data(), - gesvdj_params, - stream)); + CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj( + cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, + left_sing_vecs, m, right_sing_vecs, n, d_work.data(), lwork, devInfo.data(), + gesvdj_params, stream)); CUSOLVER_CHECK(cusolverDnDestroyGesvdjInfo(gesvdj_params)); } @@ -288,36 +207,18 @@ void svdJacobi(const raft::handle_t& handle, * @param stream cuda stream */ template -void svdReconstruction(const raft::handle_t& handle, - math_t* U, - math_t* S, - math_t* V, - math_t* out, - int n_rows, - int n_cols, - int k, - cudaStream_t stream) -{ +void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S, + math_t *V, math_t *out, int n_rows, int n_cols, int k, + cudaStream_t stream) { auto allocator = handle.get_device_allocator(); const math_t alpha = 1.0, beta = 0.0; raft::mr::device::buffer SVT(allocator, stream, k * n_cols); - raft::linalg::gemm( - handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream); - raft::linalg::gemm(handle, - U, - n_rows, - k, - SVT.data(), - out, - n_rows, - n_cols, - CUBLAS_OP_N, - CUBLAS_OP_N, - alpha, - beta, - stream); + raft::linalg::gemm(handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, + CUBLAS_OP_T, alpha, beta, stream); + raft::linalg::gemm(handle, U, n_rows, k, SVT.data(), out, n_rows, n_cols, + CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); } /** @@ -335,18 +236,10 @@ void svdReconstruction(const raft::handle_t& handle, * @param stream cuda stream */ template -bool evaluateSVDByL2Norm(const raft::handle_t& handle, - math_t* A_d, - math_t* U, - math_t* S_vec, - math_t* V, - int n_rows, - int n_cols, - int k, - math_t tol, - cudaStream_t stream) -{ - auto allocator = handle.get_device_allocator(); +bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, + math_t *S_vec, math_t *V, int n_rows, int n_cols, + int k, math_t tol, cudaStream_t stream) { + auto allocator = handle.get_device_allocator(); cublasHandle_t cublasH = handle.get_cublas_handle(); int m = n_rows, n = n_cols; @@ -370,25 +263,16 @@ bool evaluateSVDByL2Norm(const raft::handle_t& handle, // calculate percent error const math_t alpha = 1.0, beta = -1.0; raft::mr::device::buffer A_minus_P(allocator, stream, m * n); - CUDA_CHECK(cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); - - CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, - CUBLAS_OP_N, - CUBLAS_OP_N, - m, - n, - &alpha, - A_d, - m, - &beta, - P_d.data(), - m, - A_minus_P.data(), - m, - stream)); - - math_t norm_A_minus_P = raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream); - math_t percent_error = 100.0 * norm_A_minus_P / normA; + CUDA_CHECK( + cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); + + CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, + &alpha, A_d, m, &beta, P_d.data(), m, + A_minus_P.data(), m, stream)); + + math_t norm_A_minus_P = + raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream); + math_t percent_error = 100.0 * norm_A_minus_P / normA; return (percent_error / 100.0 < tol); } diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/transpose.h index 9b954c29c1..d90f6271fa 100644 --- a/cpp/include/raft/linalg/transpose.h +++ b/cpp/include/raft/linalg/transpose.h @@ -33,34 +33,18 @@ namespace linalg { * @param stream: cuda stream */ template -void transpose(const raft::handle_t& handle, - math_t* in, - math_t* out, - int n_rows, - int n_cols, - cudaStream_t stream) -{ +void transpose(const raft::handle_t &handle, math_t *in, math_t *out, + int n_rows, int n_cols, cudaStream_t stream) { cublasHandle_t cublas_h = handle.get_cublas_handle(); int out_n_rows = n_cols; int out_n_cols = n_rows; const math_t alpha = 1.0; - const math_t beta = 0.0; - CUBLAS_CHECK(raft::linalg::cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - out_n_rows, - out_n_cols, - &alpha, - in, - n_rows, - &beta, - out, - out_n_rows, - out, - out_n_rows, - stream)); + const math_t beta = 0.0; + CUBLAS_CHECK(raft::linalg::cublasgeam( + cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, out_n_rows, out_n_cols, &alpha, in, + n_rows, &beta, out, out_n_rows, out, out_n_rows, stream)); } /** @@ -70,25 +54,24 @@ void transpose(const raft::handle_t& handle, * @param stream: cuda stream */ template -void transpose(math_t* inout, int n, cudaStream_t stream) -{ - auto m = n; - auto size = n * n; - auto d_inout = inout; +void transpose(math_t *inout, int n, cudaStream_t stream) { + auto m = n; + auto size = n * n; + auto d_inout = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each( - thrust::cuda::par.on(stream), counting, counting + size, [=] __device__(int idx) { - int s_row = idx % m; - int s_col = idx / m; - int d_row = s_col; - int d_col = s_row; - if (s_row < s_col) { - auto temp = d_inout[d_col * m + d_row]; - d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row]; - d_inout[s_col * m + s_row] = temp; - } - }); + thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + [=] __device__(int idx) { + int s_row = idx % m; + int s_col = idx / m; + int d_row = s_col; + int d_col = s_row; + if (s_row < s_col) { + auto temp = d_inout[d_col * m + d_row]; + d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row]; + d_inout[s_col * m + s_row] = temp; + } + }); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh index 198b9b2b10..46b4d296cb 100644 --- a/cpp/include/raft/linalg/unary_op.cuh +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -23,9 +23,10 @@ namespace raft { namespace linalg { -template -__global__ void unaryOpKernel(OutType* out, const InType* in, IdxType len, Lambda op) -{ +template +__global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len, + Lambda op) { typedef TxN_t InVecType; typedef TxN_t OutVecType; InVecType a; @@ -41,10 +42,12 @@ __global__ void unaryOpKernel(OutType* out, const InType* in, IdxType len, Lambd b.store(out, idx); } -template -void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) -{ - const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); +template +void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op, + cudaStream_t stream) { + const IdxType nblks = + raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); unaryOpKernel <<>>(out, in, len, op); CUDA_CHECK(cudaPeekAtLastError()); @@ -65,38 +68,47 @@ void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStr * @note Lambda must be a functor with the following signature: * `OutType func(const InType& val);` */ -template -void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) -{ - if (len <= 0) return; // silently skip in case of 0 length input - constexpr auto maxSize = sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); - size_t bytes = len * maxSize; - uint64_t inAddr = uint64_t(in); - uint64_t outAddr = uint64_t(out); - if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && outAddr % 16 == 0) { - unaryOpImpl(out, in, len, op, stream); - } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && outAddr % 8 == 0) { - unaryOpImpl(out, in, len, op, stream); - } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && outAddr % 4 == 0) { - unaryOpImpl(out, in, len, op, stream); - } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) { - unaryOpImpl(out, in, len, op, stream); +template +void unaryOp(OutType *out, const InType *in, IdxType len, Lambda op, + cudaStream_t stream) { + if (len <= 0) return; //silently skip in case of 0 length input + constexpr auto maxSize = + sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t inAddr = uint64_t(in); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && + outAddr % 16 == 0) { + unaryOpImpl( + out, in, len, op, stream); + } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && + outAddr % 8 == 0) { + unaryOpImpl( + out, in, len, op, stream); + } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && + outAddr % 4 == 0) { + unaryOpImpl( + out, in, len, op, stream); + } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && + outAddr % 2 == 0) { + unaryOpImpl( + out, in, len, op, stream); } else if (1 / maxSize) { - unaryOpImpl(out, in, len, op, stream); + unaryOpImpl( + out, in, len, op, stream); } else { - unaryOpImpl(out, in, len, op, stream); + unaryOpImpl(out, in, len, op, + stream); } } template -__global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op) -{ +__global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) { IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - if (idx < len) { op(out + idx, idx); } + if (idx < len) { + op(out + idx, idx); + } } /** @@ -116,12 +128,14 @@ __global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op) * where outLocationOffset will be out + idx. * @param[in] stream cuda stream where to launch work */ -template -void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) -{ +template +void writeOnlyUnaryOp(OutType *out, IdxType len, Lambda op, + cudaStream_t stream) { if (len <= 0) return; // silently skip in case of 0 length input auto nblks = raft::ceildiv(len, TPB); - writeOnlyUnaryOpKernel<<>>(out, len, op); + writeOnlyUnaryOpKernel + <<>>(out, len, op); CUDA_CHECK(cudaGetLastError()); } diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh index 579491b5cc..0a72117140 100644 --- a/cpp/include/raft/matrix/math.cuh +++ b/cpp/include/raft/matrix/math.cuh @@ -41,18 +41,14 @@ namespace matrix { * @param stream cuda stream */ template -void power(math_t* in, math_t* out, math_t scalar, int len, cudaStream_t stream) -{ - auto d_src = in; +void power(math_t *in, math_t *out, math_t scalar, int len, + cudaStream_t stream) { + auto d_src = in; auto d_dest = out; raft::linalg::binaryOp( - d_dest, - d_src, - d_src, - len, - [=] __device__(math_t a, math_t b) { return scalar * a * b; }, - stream); + d_dest, d_src, d_src, len, + [=] __device__(math_t a, math_t b) { return scalar * a * b; }, stream); } /** @@ -63,8 +59,7 @@ void power(math_t* in, math_t* out, math_t scalar, int len, cudaStream_t stream) * @param stream cuda stream */ template -void power(math_t* inout, math_t scalar, int len, cudaStream_t stream) -{ +void power(math_t *inout, math_t scalar, int len, cudaStream_t stream) { power(inout, inout, scalar, len, stream); } @@ -75,8 +70,7 @@ void power(math_t* inout, math_t scalar, int len, cudaStream_t stream) * @param stream cuda stream */ template -void power(math_t* inout, int len, cudaStream_t stream) -{ +void power(math_t *inout, int len, cudaStream_t stream) { math_t scalar = 1.0; power(inout, scalar, len, stream); } @@ -90,8 +84,7 @@ void power(math_t* inout, int len, cudaStream_t stream) * @{ */ template -void power(math_t* in, math_t* out, int len, cudaStream_t stream) -{ +void power(math_t *in, math_t *out, int len, cudaStream_t stream) { math_t scalar = 1.0; power(in, out, scalar, len, stream); } @@ -108,20 +101,13 @@ void power(math_t* in, math_t* out, int len, cudaStream_t stream) * @param set_neg_zero whether to set negative numbers to zero */ template -void seqRoot(math_t* in, - math_t* out, - math_t scalar, - IdxType len, - cudaStream_t stream, - bool set_neg_zero = false) -{ - auto d_src = in; +void seqRoot(math_t *in, math_t *out, math_t scalar, IdxType len, + cudaStream_t stream, bool set_neg_zero = false) { + auto d_src = in; auto d_dest = out; raft::linalg::unaryOp( - d_dest, - d_src, - len, + d_dest, d_src, len, [=] __device__(math_t a) { if (set_neg_zero) { if (a < math_t(0)) { @@ -147,9 +133,8 @@ void seqRoot(math_t* in, * @param set_neg_zero whether to set negative numbers to zero */ template -void seqRoot( - math_t* inout, math_t scalar, IdxType len, cudaStream_t stream, bool set_neg_zero = false) -{ +void seqRoot(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream, + bool set_neg_zero = false) { seqRoot(inout, inout, scalar, len, stream, set_neg_zero); } @@ -163,27 +148,22 @@ void seqRoot( * @param stream cuda stream */ template -void seqRoot(math_t* in, math_t* out, IdxType len, cudaStream_t stream) -{ +void seqRoot(math_t *in, math_t *out, IdxType len, cudaStream_t stream) { math_t scalar = 1.0; seqRoot(in, out, scalar, len, stream); } template -void seqRoot(math_t* inout, IdxType len, cudaStream_t stream) -{ +void seqRoot(math_t *inout, IdxType len, cudaStream_t stream) { math_t scalar = 1.0; seqRoot(inout, inout, scalar, len, stream); } template -void setSmallValuesZero( - math_t* out, const math_t* in, IdxType len, cudaStream_t stream, math_t thres = 1e-15) -{ +void setSmallValuesZero(math_t *out, const math_t *in, IdxType len, + cudaStream_t stream, math_t thres = 1e-15) { raft::linalg::unaryOp( - out, - in, - len, + out, in, len, [=] __device__(math_t a) { if (a <= thres && -a <= thres) { return math_t(0); @@ -204,8 +184,8 @@ void setSmallValuesZero( * @param thres: threshold */ template -void setSmallValuesZero(math_t* inout, IdxType len, cudaStream_t stream, math_t thres = 1e-15) -{ +void setSmallValuesZero(math_t *inout, IdxType len, cudaStream_t stream, + math_t thres = 1e-15) { setSmallValuesZero(inout, inout, len, stream, thres); } @@ -223,21 +203,14 @@ void setSmallValuesZero(math_t* inout, IdxType len, cudaStream_t stream, math_t * @{ */ template -void reciprocal(math_t* in, - math_t* out, - math_t scalar, - int len, - cudaStream_t stream, - bool setzero = false, - math_t thres = 1e-15) -{ - auto d_src = in; +void reciprocal(math_t *in, math_t *out, math_t scalar, int len, + cudaStream_t stream, bool setzero = false, + math_t thres = 1e-15) { + auto d_src = in; auto d_dest = out; raft::linalg::unaryOp( - d_dest, - d_src, - len, + d_dest, d_src, len, [=] __device__(math_t a) { if (setzero) { if (abs(a) <= thres) { @@ -264,13 +237,8 @@ void reciprocal(math_t* in, * @param thres: Threshold to avoid dividing by zero (|value| < thres -> result = 0) */ template -void reciprocal(math_t* inout, - math_t scalar, - IdxType len, - cudaStream_t stream, - bool setzero = false, - math_t thres = 1e-15) -{ +void reciprocal(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream, + bool setzero = false, math_t thres = 1e-15) { reciprocal(inout, inout, scalar, len, stream, setzero, thres); } @@ -283,8 +251,7 @@ void reciprocal(math_t* inout, * @param stream cuda stream */ template -void reciprocal(math_t* inout, IdxType len, cudaStream_t stream) -{ +void reciprocal(math_t *inout, IdxType len, cudaStream_t stream) { math_t scalar = 1.0; reciprocal(inout, scalar, len, stream); } @@ -299,15 +266,14 @@ void reciprocal(math_t* inout, IdxType len, cudaStream_t stream) * @param stream cuda stream */ template -void reciprocal(math_t* in, math_t* out, IdxType len, cudaStream_t stream) -{ +void reciprocal(math_t *in, math_t *out, IdxType len, cudaStream_t stream) { math_t scalar = 1.0; reciprocal(in, out, scalar, len, stream); } template -void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_t stream = 0) -{ +void setValue(math_t *out, const math_t *in, math_t scalar, int len, + cudaStream_t stream = 0) { raft::linalg::unaryOp( out, in, len, [scalar] __device__(math_t in) { return scalar; }, stream); } @@ -323,44 +289,46 @@ void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_ * @param stream cuda stream */ template -void ratio( - const raft::handle_t& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream) -{ - auto d_src = src; +void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len, + cudaStream_t stream) { + auto d_src = src; auto d_dest = dest; - std::shared_ptr allocator = handle.get_device_allocator(); + std::shared_ptr allocator = + handle.get_device_allocator(); raft::mr::device::buffer d_sum(allocator, stream, 1); - auto* d_sum_ptr = d_sum.data(); - auto no_op = [] __device__(math_t in) { return in; }; + auto *d_sum_ptr = d_sum.data(); + auto no_op = [] __device__(math_t in) { return in; }; raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src); raft::linalg::unaryOp( - d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); }, stream); + d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); }, + stream); } /** @} */ // Computes the argmax(d_in) column-wise in a DxN matrix template -__global__ void argmaxKernel(const T* d_in, int D, int N, T* argmax) -{ +__global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) { typedef cub::BlockReduce, TPB> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; // compute maxIndex=argMax index for column - using KVP = cub::KeyValuePair; + using KVP = cub::KeyValuePair; int rowStart = blockIdx.x * D; KVP thread_data(-1, -raft::myInf()); for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; + int idx = rowStart + i; thread_data = cub::ArgMax()(thread_data, KVP(i, d_in[idx])); } auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax()); - if (threadIdx.x == 0) { argmax[blockIdx.x] = maxKV.key; } + if (threadIdx.x == 0) { + argmax[blockIdx.x] = maxKV.key; + } } /** @@ -372,8 +340,8 @@ __global__ void argmaxKernel(const T* d_in, int D, int N, T* argmax) * @param stream: cuda stream */ template -void argmax(const math_t* in, int n_rows, int n_cols, math_t* out, cudaStream_t stream) -{ +void argmax(const math_t *in, int n_rows, int n_cols, math_t *out, + cudaStream_t stream) { int D = n_rows; int N = n_cols; if (D <= 32) { @@ -392,29 +360,30 @@ void argmax(const math_t* in, int n_rows, int n_cols, math_t* out, cudaStream_t // Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by // flipping the sign if the |max| value for each column is negative. template -__global__ void signFlipKernel(T* d_in, int D, int N) -{ +__global__ void signFlipKernel(T *d_in, int D, int N) { typedef cub::BlockReduce, TPB> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; // compute maxIndex=argMax (with abs()) index for column - using KVP = cub::KeyValuePair; + using KVP = cub::KeyValuePair; int rowStart = blockIdx.x * D; KVP thread_data(0, 0); for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; + int idx = rowStart + i; thread_data = cub::ArgMax()(thread_data, KVP(idx, abs(d_in[idx]))); } auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax()); // flip column sign if d_in[maxIndex] < 0 __shared__ bool need_sign_flip; - if (threadIdx.x == 0) { need_sign_flip = d_in[maxKV.key] < T(0); } + if (threadIdx.x == 0) { + need_sign_flip = d_in[maxKV.key] < T(0); + } __syncthreads(); if (need_sign_flip) { for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; + int idx = rowStart + i; d_in[idx] = -d_in[idx]; } } @@ -429,10 +398,9 @@ __global__ void signFlipKernel(T* d_in, int D, int N) * @param stream cuda stream */ template -void signFlip(math_t* inout, int n_rows, int n_cols, cudaStream_t stream) -{ - int D = n_rows; - int N = n_cols; +void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) { + int D = n_rows; + int N = n_cols; auto data = inout; if (D <= 32) { signFlipKernel<<>>(data, D, N); @@ -447,43 +415,20 @@ void signFlip(math_t* inout, int n_rows, int n_cols, cudaStream_t stream) } template -void matrixVectorBinaryMult(Type* data, - const Type* vec, - IdxType n_row, - IdxType n_col, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ +void matrixVectorBinaryMult(Type *data, const Type *vec, IdxType n_row, + IdxType n_col, bool rowMajor, bool bcastAlongRows, + cudaStream_t stream) { raft::linalg::matrixVectorOp( - data, - data, - vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { return a * b; }, - stream); + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { return a * b; }, stream); } template -void matrixVectorBinaryMultSkipZero(Type* data, - const Type* vec, - IdxType n_row, - IdxType n_col, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ +void matrixVectorBinaryMultSkipZero(Type *data, const Type *vec, IdxType n_row, + IdxType n_col, bool rowMajor, + bool bcastAlongRows, cudaStream_t stream) { raft::linalg::matrixVectorOp( - data, - data, - vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, [] __device__(Type a, Type b) { if (b == Type(0)) return a; @@ -494,45 +439,22 @@ void matrixVectorBinaryMultSkipZero(Type* data, } template -void matrixVectorBinaryDiv(Type* data, - const Type* vec, - IdxType n_row, - IdxType n_col, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ +void matrixVectorBinaryDiv(Type *data, const Type *vec, IdxType n_row, + IdxType n_col, bool rowMajor, bool bcastAlongRows, + cudaStream_t stream) { raft::linalg::matrixVectorOp( - data, - data, - vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { return a / b; }, - stream); + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { return a / b; }, stream); } template -void matrixVectorBinaryDivSkipZero(Type* data, - const Type* vec, - IdxType n_row, - IdxType n_col, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream, - bool return_zero = false) -{ +void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row, + IdxType n_col, bool rowMajor, + bool bcastAlongRows, cudaStream_t stream, + bool return_zero = false) { if (return_zero) { raft::linalg::matrixVectorOp( - data, - data, - vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, [] __device__(Type a, Type b) { if (raft::myAbs(b) < Type(1e-10)) return Type(0); @@ -542,13 +464,7 @@ void matrixVectorBinaryDivSkipZero(Type* data, stream); } else { raft::linalg::matrixVectorOp( - data, - data, - vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, [] __device__(Type a, Type b) { if (raft::myAbs(b) < Type(1e-10)) return a; @@ -560,45 +476,21 @@ void matrixVectorBinaryDivSkipZero(Type* data, } template -void matrixVectorBinaryAdd(Type* data, - const Type* vec, - IdxType n_row, - IdxType n_col, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ +void matrixVectorBinaryAdd(Type *data, const Type *vec, IdxType n_row, + IdxType n_col, bool rowMajor, bool bcastAlongRows, + cudaStream_t stream) { raft::linalg::matrixVectorOp( - data, - data, - vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { return a + b; }, - stream); + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { return a + b; }, stream); } template -void matrixVectorBinarySub(Type* data, - const Type* vec, - IdxType n_row, - IdxType n_col, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ +void matrixVectorBinarySub(Type *data, const Type *vec, IdxType n_row, + IdxType n_col, bool rowMajor, bool bcastAlongRows, + cudaStream_t stream) { raft::linalg::matrixVectorOp( - data, - data, - vec, - n_col, - n_row, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { return a - b; }, - stream); + data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { return a - b; }, stream); } }; // end namespace matrix diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh index 71a2888545..5f5755e24e 100644 --- a/cpp/include/raft/matrix/matrix.cuh +++ b/cpp/include/raft/matrix/matrix.cuh @@ -49,33 +49,29 @@ using namespace std; * @param rowMajor whether the matrix has row major layout */ template -void copyRows(const m_t* in, - idx_t n_rows, - idx_t n_cols, - m_t* out, - const idx_array_t* indices, - idx_t n_rows_indices, - cudaStream_t stream, - bool rowMajor = false) -{ +void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, + const idx_array_t *indices, idx_t n_rows_indices, + cudaStream_t stream, bool rowMajor = false) { if (rowMajor) { const idx_t TPB = 256; - cache::get_vecs<<>>( - in, n_cols, indices, n_rows_indices, out); + cache:: + get_vecs<<>>( + in, n_cols, indices, n_rows_indices, out); CUDA_CHECK(cudaPeekAtLastError()); return; } - idx_t size = n_rows_indices * n_cols; + idx_t size = n_rows_indices * n_cols; auto counting = thrust::make_counting_iterator(0); - thrust::for_each( - thrust::cuda::par.on(stream), counting, counting + size, [=] __device__(idx_t idx) { - idx_t row = idx % n_rows_indices; - idx_t col = idx / n_rows_indices; + thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + [=] __device__(idx_t idx) { + idx_t row = idx % n_rows_indices; + idx_t col = idx / n_rows_indices; - out[col * n_rows_indices + row] = in[col * n_rows + indices[row]]; - }); + out[col * n_rows_indices + row] = + in[col * n_rows + indices[row]]; + }); } /** @@ -87,8 +83,8 @@ void copyRows(const m_t* in, * @param stream: cuda stream */ template -void copy(const m_t* in, m_t* out, idx_t n_rows, idx_t n_cols, cudaStream_t stream) -{ +void copy(const m_t *in, m_t *out, idx_t n_rows, idx_t n_cols, + cudaStream_t stream) { raft::copy_async(out, in, n_rows * n_cols, stream); } @@ -103,22 +99,21 @@ void copy(const m_t* in, m_t* out, idx_t n_rows, idx_t n_cols, cudaStream_t stre * @param stream: cuda stream */ template -void truncZeroOrigin( - m_t* in, idx_t in_n_rows, m_t* out, idx_t out_n_rows, idx_t out_n_cols, cudaStream_t stream) -{ - auto m = out_n_rows; - auto k = in_n_rows; - idx_t size = out_n_rows * out_n_cols; - auto d_q = in; +void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows, + idx_t out_n_cols, cudaStream_t stream) { + auto m = out_n_rows; + auto k = in_n_rows; + idx_t size = out_n_rows * out_n_cols; + auto d_q = in; auto d_q_trunc = out; - auto counting = thrust::make_counting_iterator(0); + auto counting = thrust::make_counting_iterator(0); - thrust::for_each( - thrust::cuda::par.on(stream), counting, counting + size, [=] __device__(idx_t idx) { - idx_t row = idx % m; - idx_t col = idx / m; - d_q_trunc[col * m + row] = d_q[col * k + row]; - }); + thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + [=] __device__(idx_t idx) { + idx_t row = idx % m; + idx_t col = idx / m; + d_q_trunc[col * m + row] = d_q[col * k + row]; + }); } /** @@ -130,25 +125,24 @@ void truncZeroOrigin( * @param stream: cuda stream */ template -void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) -{ - auto n = n_cols; - auto m = n_rows; - idx_t size = n_rows * n_cols; - auto d_q = inout; +void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { + auto n = n_cols; + auto m = n_rows; + idx_t size = n_rows * n_cols; + auto d_q = inout; auto d_q_reversed = inout; - auto counting = thrust::make_counting_iterator(0); + auto counting = thrust::make_counting_iterator(0); - thrust::for_each( - thrust::cuda::par.on(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { - idx_t dest_row = idx % m; - idx_t dest_col = idx / m; - idx_t src_row = dest_row; - idx_t src_col = (n - dest_col) - 1; - m_t temp = (m_t)d_q_reversed[idx]; - d_q_reversed[idx] = d_q[src_col * m + src_row]; - d_q[src_col * m + src_row] = temp; - }); + thrust::for_each(thrust::cuda::par.on(stream), counting, + counting + (size / 2), [=] __device__(idx_t idx) { + idx_t dest_row = idx % m; + idx_t dest_col = idx / m; + idx_t src_row = dest_row; + idx_t src_col = (n - dest_col) - 1; + m_t temp = (m_t)d_q_reversed[idx]; + d_q_reversed[idx] = d_q[src_col * m + src_row]; + d_q[src_col * m + src_row] = temp; + }); } /** @@ -160,26 +154,25 @@ void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) * @param stream: cuda stream */ template -void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) -{ - auto m = n_rows; - idx_t size = n_rows * n_cols; - auto d_q = inout; +void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { + auto m = n_rows; + idx_t size = n_rows * n_cols; + auto d_q = inout; auto d_q_reversed = inout; - auto counting = thrust::make_counting_iterator(0); + auto counting = thrust::make_counting_iterator(0); - thrust::for_each( - thrust::cuda::par.on(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { - idx_t dest_row = idx % m; - idx_t dest_col = idx / m; - idx_t src_row = (m - dest_row) - 1; - ; - idx_t src_col = dest_col; + thrust::for_each(thrust::cuda::par.on(stream), counting, + counting + (size / 2), [=] __device__(idx_t idx) { + idx_t dest_row = idx % m; + idx_t dest_col = idx / m; + idx_t src_row = (m - dest_row) - 1; + ; + idx_t src_col = dest_col; - m_t temp = (m_t)d_q_reversed[idx]; - d_q_reversed[idx] = d_q[src_col * m + src_row]; - d_q[src_col * m + src_row] = temp; - }); + m_t temp = (m_t)d_q_reversed[idx]; + d_q_reversed[idx] = d_q[src_col * m + src_row]; + d_q[src_col * m + src_row] = temp; + }); } /** @@ -191,16 +184,16 @@ void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) * @param v_separator: vertical separator character */ template -void print( - const m_t* in, idx_t n_rows, idx_t n_cols, char h_separator = ' ', char v_separator = '\n') -{ +void print(const m_t *in, idx_t n_rows, idx_t n_cols, char h_separator = ' ', + char v_separator = '\n') { std::vector h_matrix = std::vector(n_cols * n_rows); - CUDA_CHECK( - cudaMemcpy(h_matrix.data(), in, n_cols * n_rows * sizeof(m_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_matrix.data(), in, n_cols * n_rows * sizeof(m_t), + cudaMemcpyDeviceToHost)); for (idx_t i = 0; i < n_rows; i++) { for (idx_t j = 0; j < n_cols; j++) { - printf("%1.4f%c", h_matrix[j * n_rows + i], j < n_cols - 1 ? h_separator : v_separator); + printf("%1.4f%c", h_matrix[j * n_rows + i], + j < n_cols - 1 ? h_separator : v_separator); } } } @@ -212,8 +205,7 @@ void print( * @param n_cols: number of columns of input matrix */ template -void printHost(const m_t* in, idx_t n_rows, idx_t n_cols) -{ +void printHost(const m_t *in, idx_t n_rows, idx_t n_cols) { for (idx_t i = 0; i < n_rows; i++) { for (idx_t j = 0; j < n_cols; j++) { printf("%1.4f ", in[j * n_rows + i]); @@ -234,9 +226,8 @@ void printHost(const m_t* in, idx_t n_rows, idx_t n_cols) * (1-based) */ template -__global__ void slice( - m_t* src_d, idx_t m, idx_t n, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2) -{ +__global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1, + idx_t y1, idx_t x2, idx_t y2) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t dm = x2 - x1, dn = y2 - y1; if (idx < dm * dn) { @@ -260,16 +251,8 @@ __global__ void slice( * @param stream: cuda stream */ template -void sliceMatrix(m_t* in, - idx_t n_rows, - idx_t n_cols, - m_t* out, - idx_t x1, - idx_t y1, - idx_t x2, - idx_t y2, - cudaStream_t stream) -{ +void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1, + idx_t y1, idx_t x2, idx_t y2, cudaStream_t stream) { // Slicing dim3 block(64); dim3 grid(((x2 - x1) * (y2 - y1) + block.x - 1) / block.x); @@ -285,13 +268,15 @@ void sliceMatrix(m_t* in, * @param k: min(n_rows, n_cols) */ template -__global__ void getUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, idx_t k) -{ +__global__ void getUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, + idx_t n_cols, idx_t k) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t m = n_rows, n = n_cols; if (idx < m * n) { idx_t i = idx % m, j = idx / m; - if (i < k && j < k && j >= i) { dst[i + j * k] = src[idx]; } + if (i < k && j < k && j >= i) { + dst[i + j * k] = src[idx]; + } } } @@ -304,8 +289,8 @@ __global__ void getUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_col * @param stream: cuda stream */ template -void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream) -{ +void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols, + cudaStream_t stream) { idx_t m = n_rows, n = n_cols; idx_t k = min(m, n); dim3 block(64); @@ -322,11 +307,13 @@ void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStr * @param k: dimensionality */ template -__global__ void copyVectorToMatrixDiagonal(m_t* vec, m_t* matrix, idx_t m, idx_t n, idx_t k) -{ +__global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, idx_t m, + idx_t n, idx_t k) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < k) { matrix[idx + idx * m] = vec[idx]; } + if (idx < k) { + matrix[idx + idx * m] = vec[idx]; + } } /** @@ -338,13 +325,13 @@ __global__ void copyVectorToMatrixDiagonal(m_t* vec, m_t* matrix, idx_t m, idx_t * @param stream: cuda stream */ template -void initializeDiagonalMatrix( - m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream) -{ +void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols, + cudaStream_t stream) { idx_t k = min(n_rows, n_cols); dim3 block(64); dim3 grid((k + block.x - 1) / block.x); - copyVectorToMatrixDiagonal<<>>(vec, matrix, n_rows, n_cols, k); + copyVectorToMatrixDiagonal<<>>(vec, matrix, n_rows, + n_cols, k); } /** @@ -354,10 +341,11 @@ void initializeDiagonalMatrix( * @param len: size of one side of the matrix */ template -__global__ void matrixDiagonalInverse(m_t* in, idx_t len) -{ +__global__ void matrixDiagonalInverse(m_t *in, idx_t len) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < len) { in[idx + idx * len] = 1.0 / in[idx + idx * len]; } + if (idx < len) { + in[idx + idx * len] = 1.0 / in[idx + idx * len]; + } } /** @@ -367,8 +355,7 @@ __global__ void matrixDiagonalInverse(m_t* in, idx_t len) * @param stream: cuda stream */ template -void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream) -{ +void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) { dim3 block(64); dim3 grid((len + block.x - 1) / block.x); matrixDiagonalInverse<<>>(in, len); @@ -382,11 +369,12 @@ void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream) * @param stream: cuda stream */ template -m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream) -{ +m_t getL2Norm(const raft::handle_t &handle, m_t *in, idx_t size, + cudaStream_t stream) { cublasHandle_t cublasH = handle.get_cublas_handle(); - m_t normval = 0; - CUBLAS_CHECK(raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream)); + m_t normval = 0; + CUBLAS_CHECK( + raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream)); return normval; } diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp index 18c8be5f45..29e0d7cfcd 100644 --- a/cpp/include/raft/mr/buffer_base.hpp +++ b/cpp/include/raft/mr/buffer_base.hpp @@ -35,11 +35,11 @@ namespace mr { template class buffer_base { public: - using size_type = std::size_t; - using value_type = T; - using iterator = value_type*; - using const_iterator = const value_type*; - using reference = T&; + using size_type = std::size_t; + using value_type = T; + using iterator = value_type*; + using const_iterator = const value_type*; + using reference = T&; using const_reference = const T&; buffer_base() = delete; @@ -55,12 +55,16 @@ class buffer_base { * @param[in] stream cuda stream where this allocation operations are async * @param[in] n size of the buffer (in number of elements) */ - buffer_base(std::shared_ptr allocator, cudaStream_t stream, size_type n = 0) - : data_(nullptr), size_(n), capacity_(n), stream_(stream), allocator_(std::move(allocator)) - { + buffer_base(std::shared_ptr allocator, cudaStream_t stream, + size_type n = 0) + : data_(nullptr), + size_(n), + capacity_(n), + stream_(stream), + allocator_(std::move(allocator)) { if (capacity_ > 0) { - data_ = - static_cast(allocator_->allocate(capacity_ * sizeof(value_type), stream_)); + data_ = static_cast( + allocator_->allocate(capacity_ * sizeof(value_type), stream_)); CUDA_CHECK(cudaStreamSynchronize(stream_)); } } @@ -94,23 +98,23 @@ class buffer_base { * @param[in] stream cuda stream where allocation operations are queued * @{ */ - void reserve(size_type new_capacity) - { + void reserve(size_type new_capacity) { if (new_capacity > capacity_) { - auto* new_data = - static_cast(allocator_->allocate(new_capacity * sizeof(value_type), stream_)); - if (size_ > 0) { raft::copy(new_data, data_, size_, stream_); } + auto* new_data = static_cast( + allocator_->allocate(new_capacity * sizeof(value_type), stream_)); + if (size_ > 0) { + raft::copy(new_data, data_, size_, stream_); + } // Only deallocate if we have allocated a pointer if (nullptr != data_) { allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); } - data_ = new_data; + data_ = new_data; capacity_ = new_capacity; } } - void reserve(size_type new_capacity, cudaStream_t stream) - { + void reserve(size_type new_capacity, cudaStream_t stream) { set_stream(stream); reserve(new_capacity); } @@ -123,14 +127,12 @@ class buffer_base { * @param[in] stream cuda stream where the work will be queued * @{ */ - void resize(const size_type new_size) - { + void resize(const size_type new_size) { reserve(new_size); size_ = new_size; } - void resize(const size_type new_size, cudaStream_t stream) - { + void resize(const size_type new_size, cudaStream_t stream) { set_stream(stream); resize(new_size); } @@ -144,18 +146,16 @@ class buffer_base { * @param[in] stream cuda stream where the work will be queued * @{ */ - void release() - { + void release() { if (nullptr != data_) { allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); } - data_ = nullptr; + data_ = nullptr; capacity_ = 0; - size_ = 0; + size_ = 0; } - void release(cudaStream_t stream) - { + void release(cudaStream_t stream) { set_stream(stream); release(); } @@ -195,8 +195,7 @@ class buffer_base { * @param[in] stream new cuda stream to be set. If it is the same as the * current one, then this method will be a no-op. */ - void set_stream(cudaStream_t stream) - { + void set_stream(cudaStream_t stream) { if (stream_ != stream) { cudaEvent_t event; CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); diff --git a/cpp/include/raft/mr/device/allocator.hpp b/cpp/include/raft/mr/device/allocator.hpp index e930b617e0..889e1640db 100644 --- a/cpp/include/raft/mr/device/allocator.hpp +++ b/cpp/include/raft/mr/device/allocator.hpp @@ -32,20 +32,17 @@ namespace device { * further to the ones listed in `Allocator`: * - Allocations may be always on the device that was specified on construction. */ -class allocator : public base_allocator { -}; +class allocator : public base_allocator {}; /** Default device allocator based on the one provided by RMM */ class default_allocator : public allocator { public: - void* allocate(std::size_t n, cudaStream_t stream) override - { + void* allocate(std::size_t n, cudaStream_t stream) override { void* ptr = rmm::mr::get_current_device_resource()->allocate(n, stream); return ptr; } - void deallocate(void* p, std::size_t n, cudaStream_t stream) override - { + void deallocate(void* p, std::size_t n, cudaStream_t stream) override { rmm::mr::get_current_device_resource()->deallocate(p, n, stream); } }; // class default_allocator diff --git a/cpp/include/raft/mr/device/buffer.hpp b/cpp/include/raft/mr/device/buffer.hpp index 2b9d84368f..39b5674ce4 100644 --- a/cpp/include/raft/mr/device/buffer.hpp +++ b/cpp/include/raft/mr/device/buffer.hpp @@ -46,11 +46,11 @@ namespace device { template class buffer : public buffer_base { public: - using size_type = typename buffer_base::size_type; - using value_type = typename buffer_base::value_type; - using iterator = typename buffer_base::iterator; - using const_iterator = typename buffer_base::const_iterator; - using reference = typename buffer_base::reference; + using size_type = typename buffer_base::size_type; + using value_type = typename buffer_base::value_type; + using iterator = typename buffer_base::iterator; + using const_iterator = typename buffer_base::const_iterator; + using reference = typename buffer_base::reference; using const_reference = typename buffer_base::const_reference; buffer() = delete; @@ -60,9 +60,7 @@ class buffer : public buffer_base { buffer& operator=(const buffer& other) = delete; buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) - : buffer_base(alloc, stream, n) - { - } + : buffer_base(alloc, stream, n) {} }; // class buffer }; // namespace device diff --git a/cpp/include/raft/mr/host/allocator.hpp b/cpp/include/raft/mr/host/allocator.hpp index 62b6826211..8af266d4f0 100644 --- a/cpp/include/raft/mr/host/allocator.hpp +++ b/cpp/include/raft/mr/host/allocator.hpp @@ -34,23 +34,20 @@ namespace host { * further to the ones listed in `Allocator`: * - Allocations don't need to be zero copy accessible form a device. */ -class allocator : public base_allocator { -}; +class allocator : public base_allocator {}; /** Default cudaMallocHost/cudaFreeHost based host allocator */ class default_allocator : public allocator { public: - void* allocate(std::size_t n, cudaStream_t stream) override - { + void* allocate(std::size_t n, cudaStream_t stream) override { void* ptr = nullptr; CUDA_CHECK(cudaMallocHost(&ptr, n)); return ptr; } - void deallocate(void* p, std::size_t n, cudaStream_t stream) override - { - // Must call _NO_THROW here since this is called frequently from object - // destructors which are "nothrow" by default + void deallocate(void* p, std::size_t n, cudaStream_t stream) override { + //Must call _NO_THROW here since this is called frequently from object + //destructors which are "nothrow" by default CUDA_CHECK_NO_THROW(cudaFreeHost(p)); } }; // class default_allocator diff --git a/cpp/include/raft/mr/host/buffer.hpp b/cpp/include/raft/mr/host/buffer.hpp index 52475ad6ec..3c505bf2ed 100644 --- a/cpp/include/raft/mr/host/buffer.hpp +++ b/cpp/include/raft/mr/host/buffer.hpp @@ -48,11 +48,11 @@ namespace host { template class buffer : public buffer_base { public: - using size_type = typename buffer_base::size_type; - using value_type = typename buffer_base::value_type; - using iterator = typename buffer_base::iterator; - using const_iterator = typename buffer_base::const_iterator; - using reference = typename buffer_base::reference; + using size_type = typename buffer_base::size_type; + using value_type = typename buffer_base::value_type; + using iterator = typename buffer_base::iterator; + using const_iterator = typename buffer_base::const_iterator; + using reference = typename buffer_base::reference; using const_reference = typename buffer_base::const_reference; buffer() = delete; @@ -62,15 +62,14 @@ class buffer : public buffer_base { buffer& operator=(const buffer& other) = delete; buffer(std::shared_ptr alloc, const device::buffer& other) - : buffer_base(alloc, other.get_stream(), other.size()) - { - if (other.size() > 0) { raft::copy(data_, other.data(), other.size(), other.get_stream()); } + : buffer_base(alloc, other.get_stream(), other.size()) { + if (other.size() > 0) { + raft::copy(data_, other.data(), other.size(), other.get_stream()); + } } buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) - : buffer_base(alloc, stream, n) - { - } + : buffer_base(alloc, stream, n) {} reference operator[](size_type pos) { return data_[pos]; } diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh index 5267770e8a..56710ea81f 100644 --- a/cpp/include/raft/random/rng.cuh +++ b/cpp/include/raft/random/rng.cuh @@ -43,9 +43,10 @@ enum GeneratorType { GenKiss99 }; -template -__global__ void randKernel(uint64_t seed, uint64_t offset, OutType* ptr, LenType len, Lambda randOp) -{ +template +__global__ void randKernel(uint64_t seed, uint64_t offset, OutType *ptr, + LenType len, Lambda randOp) { LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x; detail::Generator gen(seed, (uint64_t)tid, offset); const LenType stride = gridDim.x * blockDim.x; @@ -57,10 +58,10 @@ __global__ void randKernel(uint64_t seed, uint64_t offset, OutType* ptr, LenType } // used for Box-Muller type transformations -template -__global__ void rand2Kernel( - uint64_t seed, uint64_t offset, OutType* ptr, LenType len, Lambda2 rand2Op) -{ +template +__global__ void rand2Kernel(uint64_t seed, uint64_t offset, OutType *ptr, + LenType len, Lambda2 rand2Op) { LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x; detail::Generator gen(seed, (uint64_t)tid, offset); const LenType stride = gridDim.x * blockDim.x; @@ -76,9 +77,8 @@ __global__ void rand2Kernel( } template -__global__ void constFillKernel(Type* ptr, int len, Type val) -{ - unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; +__global__ void constFillKernel(Type *ptr, int len, Type val) { + unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; const unsigned stride = gridDim.x * blockDim.x; for (unsigned idx = tid; idx < len; idx += stride) { ptr[idx] = val; @@ -99,20 +99,19 @@ __global__ void constFillKernel(Type* ptr, int len, Type val) * @{ */ template -DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type sigma2, Type mu2) -{ - constexpr Type twoPi = Type(2.0) * Type(3.141592654); +DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1, + Type sigma2, Type mu2) { + constexpr Type twoPi = Type(2.0) * Type(3.141592654); constexpr Type minus2 = -Type(2.0); - Type R = raft::mySqrt(minus2 * raft::myLog(val1)); - Type theta = twoPi * val2; + Type R = raft::mySqrt(minus2 * raft::myLog(val1)); + Type theta = twoPi * val2; Type s, c; raft::mySinCos(theta, s, c); val1 = R * c * sigma1 + mu1; val2 = R * s * sigma2 + mu2; } template -DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1) -{ +DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) { box_muller_transform(val1, val2, sigma1, mu1, sigma1, mu1); } /** @} */ @@ -132,8 +131,7 @@ class Rng { // simple heuristic to make sure all SMs will be occupied properly // and also not too many initialization calls will be made by each thread nBlocks(4 * getMultiProcessorCount()), - gen() - { + gen() { seed(_s); } @@ -144,8 +142,7 @@ class Rng { * function of timestamp. Another example is to use the c++11's * `std::random_device` for setting seed. */ - void seed(uint64_t _s) - { + void seed(uint64_t _s) { gen.seed(_s); offset = 0; } @@ -161,8 +158,7 @@ class Rng { * @param[out] b intercept parameter */ template - void affine_transform_params(IdxT n, IdxT& a, IdxT& b) - { + void affine_transform_params(IdxT n, IdxT &a, IdxT &b) { // always keep 'a' to be coprime to 'n' a = gen() % n; while (gcd(a, n) != 1) { @@ -185,24 +181,27 @@ class Rng { * @{ */ template - void uniform(Type* ptr, LenType len, Type start, Type end, cudaStream_t stream) - { + void uniform(Type *ptr, LenType len, Type start, Type end, + cudaStream_t stream) { static_assert(std::is_floating_point::value, "Type for 'uniform' can only be floating point!"); custom_distribution( - ptr, - len, - [=] __device__(Type val, LenType idx) { return (val * (end - start)) + start; }, + ptr, len, + [=] __device__(Type val, LenType idx) { + return (val * (end - start)) + start; + }, stream); } template - void uniformInt(IntType* ptr, LenType len, IntType start, IntType end, cudaStream_t stream) - { - static_assert(std::is_integral::value, "Type for 'uniformInt' can only be integer!"); + void uniformInt(IntType *ptr, LenType len, IntType start, IntType end, + cudaStream_t stream) { + static_assert(std::is_integral::value, + "Type for 'uniformInt' can only be integer!"); custom_distribution( - ptr, - len, - [=] __device__(IntType val, LenType idx) { return (val % (end - start)) + start; }, + ptr, len, + [=] __device__(IntType val, LenType idx) { + return (val % (end - start)) + start; + }, stream); } /** @} */ @@ -219,37 +218,28 @@ class Rng { * @{ */ template - void normal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream) - { + void normal(Type *ptr, LenType len, Type mu, Type sigma, + cudaStream_t stream) { static_assert(std::is_floating_point::value, "Type for 'normal' can only be floating point!"); rand2Impl( - offset, - ptr, - len, + offset, ptr, len, [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { box_muller_transform(val1, val2, sigma, mu); }, - NumThreads, - nBlocks, - type, - stream); + NumThreads, nBlocks, type, stream); } template - void normalInt(IntType* ptr, LenType len, IntType mu, IntType sigma, cudaStream_t stream) - { - static_assert(std::is_integral::value, "Type for 'normalInt' can only be integer!"); + void normalInt(IntType *ptr, LenType len, IntType mu, IntType sigma, + cudaStream_t stream) { + static_assert(std::is_integral::value, + "Type for 'normalInt' can only be integer!"); rand2Impl( - offset, - ptr, - len, - [=] __device__(double& val1, double& val2, LenType idx1, LenType idx2) { + offset, ptr, len, + [=] __device__(double &val1, double &val2, LenType idx1, LenType idx2) { box_muller_transform(val1, val2, sigma, mu); }, - NumThreads, - nBlocks, - type, - stream); + NumThreads, nBlocks, type, stream); } /** @} */ @@ -274,32 +264,21 @@ class Rng { * @param stream stream where to launch the kernel */ template - void normalTable(Type* ptr, - LenType n_rows, - LenType n_cols, - const Type* mu, - const Type* sigma_vec, - Type sigma, - cudaStream_t stream) - { + void normalTable(Type *ptr, LenType n_rows, LenType n_cols, const Type *mu, + const Type *sigma_vec, Type sigma, cudaStream_t stream) { rand2Impl( - offset, - ptr, - n_rows * n_cols, + offset, ptr, n_rows * n_cols, [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { // yikes! use fast-int-div - auto col1 = idx1 % n_cols; - auto col2 = idx2 % n_cols; + auto col1 = idx1 % n_cols; + auto col2 = idx2 % n_cols; auto mean1 = mu[col1]; auto mean2 = mu[col2]; - auto sig1 = sigma_vec == nullptr ? sigma : sigma_vec[col1]; - auto sig2 = sigma_vec == nullptr ? sigma : sigma_vec[col2]; + auto sig1 = sigma_vec == nullptr ? sigma : sigma_vec[col1]; + auto sig2 = sigma_vec == nullptr ? sigma : sigma_vec[col2]; box_muller_transform(val1, val2, sig1, mean1, sig2, mean2); }, - NumThreads, - nBlocks, - type, - stream); + NumThreads, nBlocks, type, stream); } /** @@ -312,8 +291,7 @@ class Rng { * @param stream stream where to launch the kernel */ template - void fill(Type* ptr, LenType len, Type val, cudaStream_t stream) - { + void fill(Type *ptr, LenType len, Type val, cudaStream_t stream) { constFillKernel<<>>(ptr, len, val); CUDA_CHECK(cudaPeekAtLastError()); } @@ -331,10 +309,10 @@ class Rng { * @param[in] stream stream where to launch the kernel */ template - void bernoulli(OutType* ptr, LenType len, Type prob, cudaStream_t stream) - { + void bernoulli(OutType *ptr, LenType len, Type prob, cudaStream_t stream) { custom_distribution( - ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; }, stream); + ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; }, + stream); } /** @@ -348,14 +326,15 @@ class Rng { * @param stream stream where to launch the kernel */ template - void scaled_bernoulli(Type* ptr, LenType len, Type prob, Type scale, cudaStream_t stream) - { + void scaled_bernoulli(Type *ptr, LenType len, Type prob, Type scale, + cudaStream_t stream) { static_assert(std::is_floating_point::value, "Type for 'scaled_bernoulli' can only be floating point!"); custom_distribution( - ptr, - len, - [=] __device__(Type val, LenType idx) { return val > prob ? -scale : scale; }, + ptr, len, + [=] __device__(Type val, LenType idx) { + return val > prob ? -scale : scale; + }, stream); } @@ -371,12 +350,12 @@ class Rng { * @note https://en.wikipedia.org/wiki/Gumbel_distribution */ template - void gumbel(Type* ptr, LenType len, Type mu, Type beta, cudaStream_t stream) - { + void gumbel(Type *ptr, LenType len, Type mu, Type beta, cudaStream_t stream) { custom_distribution( - ptr, - len, - [=] __device__(Type val, LenType idx) { return mu - beta * raft::myLog(-raft::myLog(val)); }, + ptr, len, + [=] __device__(Type val, LenType idx) { + return mu - beta * raft::myLog(-raft::myLog(val)); + }, stream); } @@ -391,21 +370,16 @@ class Rng { * @param stream stream where to launch the kernel */ template - void lognormal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream) - { + void lognormal(Type *ptr, LenType len, Type mu, Type sigma, + cudaStream_t stream) { rand2Impl( - offset, - ptr, - len, + offset, ptr, len, [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { box_muller_transform(val1, val2, sigma, mu); val1 = raft::myExp(val1); val2 = raft::myExp(val2); }, - NumThreads, - nBlocks, - type, - stream); + NumThreads, nBlocks, type, stream); } /** @@ -419,11 +393,10 @@ class Rng { * @param stream stream where to launch the kernel */ template - void logistic(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream) - { + void logistic(Type *ptr, LenType len, Type mu, Type scale, + cudaStream_t stream) { custom_distribution( - ptr, - len, + ptr, len, [=] __device__(Type val, LenType idx) { constexpr Type one = (Type)1.0; return mu - scale * raft::myLog(one / val - one); @@ -441,11 +414,9 @@ class Rng { * @param stream stream where to launch the kernel */ template - void exponential(Type* ptr, LenType len, Type lambda, cudaStream_t stream) - { + void exponential(Type *ptr, LenType len, Type lambda, cudaStream_t stream) { custom_distribution( - ptr, - len, + ptr, len, [=] __device__(Type val, LenType idx) { constexpr Type one = (Type)1.0; return -raft::myLog(one - val) / lambda; @@ -463,11 +434,9 @@ class Rng { * @param stream stream where to launch the kernel */ template - void rayleigh(Type* ptr, LenType len, Type sigma, cudaStream_t stream) - { + void rayleigh(Type *ptr, LenType len, Type sigma, cudaStream_t stream) { custom_distribution( - ptr, - len, + ptr, len, [=] __device__(Type val, LenType idx) { constexpr Type one = (Type)1.0; constexpr Type two = (Type)2.0; @@ -487,14 +456,13 @@ class Rng { * @param stream stream where to launch the kernel */ template - void laplace(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream) - { + void laplace(Type *ptr, LenType len, Type mu, Type scale, + cudaStream_t stream) { custom_distribution( - ptr, - len, + ptr, len, [=] __device__(Type val, LenType idx) { - constexpr Type one = (Type)1.0; - constexpr Type two = (Type)2.0; + constexpr Type one = (Type)1.0; + constexpr Type two = (Type)2.0; constexpr Type oneHalf = (Type)0.5; Type out; if (val <= oneHalf) { @@ -534,44 +502,43 @@ class Rng { * @param stream cuda stream */ template - void sampleWithoutReplacement(const raft::handle_t& handle, - DataT* out, - IdxT* outIdx, - const DataT* in, - const WeightsT* wts, - IdxT sampledLen, - IdxT len, - cudaStream_t stream) - { - ASSERT(sampledLen <= len, "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'."); - - std::shared_ptr allocator = handle.get_device_allocator(); + void sampleWithoutReplacement(const raft::handle_t &handle, DataT *out, + IdxT *outIdx, const DataT *in, + const WeightsT *wts, IdxT sampledLen, IdxT len, + cudaStream_t stream) { + ASSERT(sampledLen <= len, + "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'."); + + std::shared_ptr allocator = + handle.get_device_allocator(); raft::mr::device::buffer expWts(allocator, stream, len); raft::mr::device::buffer sortedWts(allocator, stream, len); raft::mr::device::buffer inIdx(allocator, stream, len); raft::mr::device::buffer outIdxBuff(allocator, stream, len); - auto* inIdxPtr = inIdx.data(); + auto *inIdxPtr = inIdx.data(); // generate modified weights custom_distribution( - expWts.data(), - len, + expWts.data(), len, [wts, inIdxPtr] __device__(WeightsT val, IdxT idx) { - inIdxPtr[idx] = idx; + inIdxPtr[idx] = idx; constexpr WeightsT one = (WeightsT)1.0; - auto exp = -raft::myLog(one - val); - if (wts != nullptr) { return exp / wts[idx]; } + auto exp = -raft::myLog(one - val); + if (wts != nullptr) { + return exp / wts[idx]; + } return exp; }, stream); ///@todo: use a more efficient partitioning scheme instead of full sort // sort the array and pick the top sampledLen items - IdxT* outIdxPtr = outIdxBuff.data(); + IdxT *outIdxPtr = outIdxBuff.data(); raft::mr::device::buffer workspace(allocator, stream); - sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, (int)len, stream); + sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, + (int)len, stream); if (outIdx != nullptr) { - CUDA_CHECK(cudaMemcpyAsync( - outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, + cudaMemcpyDeviceToDevice, stream)); } scatter(out, in, outIdxPtr, sampledLen, stream); } @@ -591,15 +558,17 @@ class Rng { * @param[in] stream cuda stream * @{ */ - template - void custom_distribution(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream) - { + template + void custom_distribution(OutType *ptr, LenType len, Lambda randOp, + cudaStream_t stream) { randImpl( offset, ptr, len, randOp, NumThreads, nBlocks, type, stream); } - template - void custom_distribution2(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream) - { + template + void custom_distribution2(OutType *ptr, LenType len, Lambda randOp, + cudaStream_t stream) { rand2Impl( offset, ptr, len, randOp, NumThreads, nBlocks, type, stream); } @@ -622,10 +591,12 @@ class Rng { static const int NumThreads = 256; template - uint64_t _setupSeeds(uint64_t& seed, uint64_t& offset, LenType len, int nThreads, int nBlocks) - { + uint64_t _setupSeeds(uint64_t &seed, uint64_t &offset, LenType len, + int nThreads, int nBlocks) { LenType itemsPerThread = raft::ceildiv(len, LenType(nBlocks * nThreads)); - if (IsNormal && itemsPerThread % 2 == 1) { ++itemsPerThread; } + if (IsNormal && itemsPerThread % 2 == 1) { + ++itemsPerThread; + } // curand uses 2 32b uint's to generate one double uint64_t factor = sizeof(Type) / sizeof(float); if (factor == 0) ++factor; @@ -633,26 +604,22 @@ class Rng { // If not, then generate new seed and start from zero offset uint64_t newOffset = offset + LenType(itemsPerThread) * factor; if (newOffset < offset) { - offset = 0; - seed = gen(); + offset = 0; + seed = gen(); newOffset = itemsPerThread * factor; } return newOffset; } - template - void randImpl(uint64_t& offset, - OutType* ptr, - LenType len, - Lambda randOp, - int nThreads, - int nBlocks, - GeneratorType type, - cudaStream_t stream) - { + template + void randImpl(uint64_t &offset, OutType *ptr, LenType len, Lambda randOp, + int nThreads, int nBlocks, GeneratorType type, + cudaStream_t stream) { if (len <= 0) return; - uint64_t seed = gen(); - auto newOffset = _setupSeeds(seed, offset, len, nThreads, nBlocks); + uint64_t seed = gen(); + auto newOffset = _setupSeeds(seed, offset, len, + nThreads, nBlocks); switch (type) { case GenPhilox: randKernel @@ -666,28 +633,26 @@ class Rng { randKernel <<>>(seed, offset, ptr, len, randOp); break; - default: ASSERT(false, "randImpl: Incorrect generator type! %d", type); + default: + ASSERT(false, "randImpl: Incorrect generator type! %d", type); }; CUDA_CHECK(cudaGetLastError()); offset = newOffset; } - template - void rand2Impl(uint64_t& offset, - OutType* ptr, - LenType len, - Lambda2 rand2Op, - int nThreads, - int nBlocks, - GeneratorType type, - cudaStream_t stream) - { + template + void rand2Impl(uint64_t &offset, OutType *ptr, LenType len, Lambda2 rand2Op, + int nThreads, int nBlocks, GeneratorType type, + cudaStream_t stream) { if (len <= 0) return; - auto seed = gen(); - auto newOffset = _setupSeeds(seed, offset, len, nThreads, nBlocks); + auto seed = gen(); + auto newOffset = _setupSeeds(seed, offset, len, + nThreads, nBlocks); switch (type) { case GenPhilox: - rand2Kernel + rand2Kernel <<>>(seed, offset, ptr, len, rand2Op); break; case GenTaps: @@ -695,10 +660,12 @@ class Rng { <<>>(seed, offset, ptr, len, rand2Op); break; case GenKiss99: - rand2Kernel + rand2Kernel <<>>(seed, offset, ptr, len, rand2Op); break; - default: ASSERT(false, "rand2Impl: Incorrect generator type! %d", type); + default: + ASSERT(false, "rand2Impl: Incorrect generator type! %d", type); }; CUDA_CHECK(cudaGetLastError()); offset = newOffset; diff --git a/cpp/include/raft/random/rng_impl.cuh b/cpp/include/raft/random/rng_impl.cuh index 485f4ddd68..d44c6f018b 100644 --- a/cpp/include/raft/random/rng_impl.cuh +++ b/cpp/include/raft/random/rng_impl.cuh @@ -33,8 +33,7 @@ struct PhiloxGenerator { * @param subsequence as found in curand docs * @param offset as found in curand docs */ - DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) - { + DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { curand_init(seed, subsequence, offset, &state); } @@ -45,21 +44,18 @@ struct PhiloxGenerator { DI void next(float& ret) { ret = curand_uniform(&(this->state)); } DI void next(double& ret) { ret = curand_uniform_double(&(this->state)); } DI void next(uint32_t& ret) { ret = curand(&(this->state)); } - DI void next(uint64_t& ret) - { + DI void next(uint64_t& ret) { uint32_t a, b; next(a); next(b); ret = (uint64_t)a | ((uint64_t)b << 32); } - DI void next(int32_t& ret) - { + DI void next(int32_t& ret) { uint32_t val; next(val); ret = int32_t(val & 0x7fffffff); } - DI void next(int64_t& ret) - { + DI void next(int64_t& ret) { uint64_t val; next(val); ret = int64_t(val & 0x7fffffffffffffff); @@ -80,9 +76,8 @@ struct TapsGenerator { * @param subsequence unused * @param offset unused */ - DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) - { - uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x; + DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { + uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x; uint64_t stride = blockDim.x * gridDim.x; delta += ((blockIdx.y * blockDim.y) + threadIdx.y) * stride; stride *= blockDim.y * gridDim.y; @@ -95,36 +90,31 @@ struct TapsGenerator { * @{ */ template - DI void next(Type& ret) - { + DI void next(Type& ret) { constexpr double ULL_LARGE = 1.8446744073709551614e19; uint64_t val; next(val); ret = static_cast(val); ret /= static_cast(ULL_LARGE); } - DI void next(uint64_t& ret) - { + DI void next(uint64_t& ret) { constexpr uint64_t TAPS = 0x8000100040002000ULL; - constexpr int ROUNDS = 128; + constexpr int ROUNDS = 128; for (int i = 0; i < ROUNDS; i++) state = (state >> 1) ^ (-(state & 1ULL) & TAPS); ret = state; } - DI void next(uint32_t& ret) - { + DI void next(uint32_t& ret) { uint64_t val; next(val); ret = (uint32_t)val; } - DI void next(int32_t& ret) - { + DI void next(int32_t& ret) { uint32_t val; next(val); ret = int32_t(val & 0x7fffffff); } - DI void next(int64_t& ret) - { + DI void next(int64_t& ret) { uint64_t val; next(val); ret = int64_t(val & 0x7fffffffffffffff); @@ -145,49 +135,46 @@ struct Kiss99Generator { * @param subsequence unused * @param offset unused */ - DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { initKiss99(seed); } + DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { + initKiss99(seed); + } /** * @defgroup NextRand Generate the next random number * @{ */ template - DI void next(Type& ret) - { + DI void next(Type& ret) { constexpr double U_LARGE = 4.294967295e9; uint32_t val; next(val); ret = static_cast(val); ret /= static_cast(U_LARGE); } - DI void next(uint32_t& ret) - { + DI void next(uint32_t& ret) { uint32_t MWC; - z = 36969 * (z & 65535) + (z >> 16); - w = 18000 * (w & 65535) + (w >> 16); + z = 36969 * (z & 65535) + (z >> 16); + w = 18000 * (w & 65535) + (w >> 16); MWC = ((z << 16) + w); jsr ^= (jsr << 17); jsr ^= (jsr >> 13); jsr ^= (jsr << 5); jcong = 69069 * jcong + 1234567; - MWC = ((MWC ^ jcong) + jsr); - ret = MWC; + MWC = ((MWC ^ jcong) + jsr); + ret = MWC; } - DI void next(uint64_t& ret) - { + DI void next(uint64_t& ret) { uint32_t a, b; next(a); next(b); ret = (uint64_t)a | ((uint64_t)b << 32); } - DI void next(int32_t& ret) - { + DI void next(int32_t& ret) { uint32_t val; next(val); ret = int32_t(val & 0x7fffffff); } - DI void next(int64_t& ret) - { + DI void next(int64_t& ret) { uint64_t val; next(val); ret = int64_t(val & 0x7fffffffffffffff); @@ -206,8 +193,7 @@ struct Kiss99Generator { // This function multiplies 128-bit hash by 128-bit FNV prime and returns lower // 128 bits. It uses 32-bit wide multiply only. - DI void mulByFnv1a128Prime(uint32_t* h) - { + DI void mulByFnv1a128Prime(uint32_t* h) { typedef union { uint32_t u32[2]; uint64_t u64[1]; @@ -231,12 +217,12 @@ struct Kiss99Generator { // h_n[2] = HI(h[1]*p[0]) + LO(h[2]*p[0]) + LO(h[0]*p[2]); // h_n[3] = HI(h[2]*p[0]) + HI(h[0]*p[2]) + LO(h[3]*p[0]) + LO(h[1]*p[2]); uint32_t carry = 0; - h[0] = h0p0.u32[0]; + h[0] = h0p0.u32[0]; - h[1] = h0p0.u32[1] + h1p0.u32[0]; + h[1] = h0p0.u32[1] + h1p0.u32[0]; carry = h[1] < h0p0.u32[1] ? 1 : 0; - h[2] = h1p0.u32[1] + carry; + h[2] = h1p0.u32[1] + carry; carry = h[2] < h1p0.u32[1] ? 1 : 0; h[2] += h2p0.u32[0]; carry = h[2] < h2p0.u32[0] ? carry + 1 : carry; @@ -247,8 +233,7 @@ struct Kiss99Generator { return; } - DI void fnv1a128(uint32_t* hash, uint32_t txt) - { + DI void fnv1a128(uint32_t* hash, uint32_t txt) { hash[0] ^= (txt >> 0) & 0xFF; mulByFnv1a128Prime(hash); hash[0] ^= (txt >> 8) & 0xFF; @@ -259,8 +244,7 @@ struct Kiss99Generator { mulByFnv1a128Prime(hash); } - DI void initKiss99(uint64_t seed) - { + DI void initKiss99(uint64_t seed) { // Initialize hash to 128-bit FNV1a basis uint32_t hash[4] = {1653982605UL, 1656234357UL, 129696066UL, 1818371886UL}; @@ -275,9 +259,9 @@ struct Kiss99Generator { fnv1a128(hash, uint32_t(seed >> 32)); // Initialize KISS99 state with hash - z = hash[0]; - w = hash[1]; - jsr = hash[2]; + z = hash[0]; + w = hash[1]; + jsr = hash[2]; jcong = hash[3]; } }; @@ -289,13 +273,10 @@ struct Kiss99Generator { template struct Generator { DI Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) - : gen(seed, subsequence, offset) - { - } + : gen(seed, subsequence, offset) {} template - DI void next(Type& ret) - { + DI void next(Type& ret) { gen.next(ret); } diff --git a/cpp/include/raft/sparse/convert/coo.cuh b/cpp/include/raft/sparse/convert/coo.cuh index 5d38bdf4a8..e367550060 100644 --- a/cpp/include/raft/sparse/convert/coo.cuh +++ b/cpp/include/raft/sparse/convert/coo.cuh @@ -37,18 +37,14 @@ namespace sparse { namespace convert { template -__global__ void csr_to_coo_kernel(const value_idx* row_ind, - value_idx m, - value_idx* coo_rows, - value_idx nnz) -{ +__global__ void csr_to_coo_kernel(const value_idx *row_ind, value_idx m, + value_idx *coo_rows, value_idx nnz) { // row-based matrix 1 thread per row value_idx row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < m) { value_idx start_idx = row_ind[row]; - value_idx stop_idx = get_stop_idx(row, m, nnz, row_ind); - for (value_idx i = start_idx; i < stop_idx; i++) - coo_rows[i] = row; + value_idx stop_idx = get_stop_idx(row, m, nnz, row_ind); + for (value_idx i = start_idx; i < stop_idx; i++) coo_rows[i] = row; } } @@ -61,14 +57,14 @@ __global__ void csr_to_coo_kernel(const value_idx* row_ind, * @param stream: cuda stream to use */ template -void csr_to_coo( - const value_idx* row_ind, value_idx m, value_idx* coo_rows, value_idx nnz, cudaStream_t stream) -{ +void csr_to_coo(const value_idx *row_ind, value_idx m, value_idx *coo_rows, + value_idx nnz, cudaStream_t stream) { // @TODO: Use cusparse for this. dim3 grid(raft::ceildiv(m, (value_idx)TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_to_coo_kernel<<>>(row_ind, m, coo_rows, nnz); + csr_to_coo_kernel + <<>>(row_ind, m, coo_rows, nnz); CUDA_CHECK(cudaGetLastError()); } diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh index 2191f5edd1..a034bdbda8 100644 --- a/cpp/include/raft/sparse/convert/csr.cuh +++ b/cpp/include/raft/sparse/convert/csr.cuh @@ -44,33 +44,29 @@ namespace sparse { namespace convert { template -void coo_to_csr(const raft::handle_t& handle, - const int* srcRows, - const int* srcCols, - const value_t* srcVals, - int nnz, - int m, - int* dst_offsets, - int* dstCols, - value_t* dstVals) -{ - auto stream = handle.get_stream(); +void coo_to_csr(const raft::handle_t &handle, const int *srcRows, + const int *srcCols, const value_t *srcVals, int nnz, int m, + int *dst_offsets, int *dstCols, value_t *dstVals) { + auto stream = handle.get_stream(); auto cusparseHandle = handle.get_cusparse_handle(); - auto d_alloc = handle.get_device_allocator(); + auto d_alloc = handle.get_device_allocator(); raft::mr::device::buffer dstRows(d_alloc, stream, nnz); - CUDA_CHECK( - cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK( - cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, + cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, + cudaMemcpyDeviceToDevice, stream)); auto buffSize = raft::sparse::cusparsecoosort_bufferSizeExt( cusparseHandle, m, m, nnz, srcRows, srcCols, stream); raft::mr::device::buffer pBuffer(d_alloc, stream, buffSize); raft::mr::device::buffer P(d_alloc, stream, nnz); - CUSPARSE_CHECK(cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data())); - raft::sparse::cusparsecoosortByRow( - cusparseHandle, m, m, nnz, dstRows.data(), dstCols, P.data(), pBuffer.data(), stream); - raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(), stream); - raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m, dst_offsets, stream); + CUSPARSE_CHECK( + cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data())); + raft::sparse::cusparsecoosortByRow(cusparseHandle, m, m, nnz, dstRows.data(), + dstCols, P.data(), pBuffer.data(), stream); + raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(), + stream); + raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m, + dst_offsets, stream); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -89,20 +85,14 @@ void coo_to_csr(const raft::handle_t& handle, * @param stream cuda stream to use * @param fused_op: the fused operation */ -template void> -void csr_adj_graph_batched(const Index_* row_ind, - Index_ total_rows, - Index_ nnz, - Index_ batchSize, - const bool* adj, - Index_* row_ind_ptr, - cudaStream_t stream, - Lambda fused_op) -{ +template void> +void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, + Index_ batchSize, const bool *adj, + Index_ *row_ind_ptr, cudaStream_t stream, + Lambda fused_op) { op::csr_row_op( - row_ind, - batchSize, - nnz, + row_ind, batchSize, nnz, [fused_op, adj, total_rows, row_ind_ptr, batchSize, nnz] __device__( Index_ row, Index_ start_idx, Index_ stop_idx) { fused_op(row, start_idx, stop_idx); @@ -118,23 +108,14 @@ void csr_adj_graph_batched(const Index_* row_ind, stream); } -template void> -void csr_adj_graph_batched(const Index_* row_ind, - Index_ total_rows, - Index_ nnz, - Index_ batchSize, - const bool* adj, - Index_* row_ind_ptr, - cudaStream_t stream) -{ - csr_adj_graph_batched(row_ind, - total_rows, - nnz, - batchSize, - adj, - row_ind_ptr, - stream, - [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {}); +template void> +void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, + Index_ batchSize, const bool *adj, + Index_ *row_ind_ptr, cudaStream_t stream) { + csr_adj_graph_batched( + row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream, + [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {}); } /** @@ -150,17 +131,13 @@ void csr_adj_graph_batched(const Index_* row_ind, * @param stream cuda stream to use * @param fused_op the fused operation */ -template void> -void csr_adj_graph(const Index_* row_ind, - Index_ total_rows, - Index_ nnz, - const bool* adj, - Index_* row_ind_ptr, - cudaStream_t stream, - Lambda fused_op) -{ - csr_adj_graph_batched( - row_ind, total_rows, nnz, total_rows, adj, row_ind_ptr, stream, fused_op); +template void> +void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz, + const bool *adj, Index_ *row_ind_ptr, cudaStream_t stream, + Lambda fused_op) { + csr_adj_graph_batched(row_ind, total_rows, nnz, total_rows, + adj, row_ind_ptr, stream, fused_op); } /** @@ -174,13 +151,9 @@ void csr_adj_graph(const Index_* row_ind, * @param stream: cuda stream to use */ template -void sorted_coo_to_csr(const T* rows, - int nnz, - T* row_ind, - int m, +void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, std::shared_ptr d_alloc, - cudaStream_t stream) -{ + cudaStream_t stream) { raft::mr::device::buffer row_counts(d_alloc, stream, m); CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream)); @@ -188,9 +161,11 @@ void sorted_coo_to_csr(const T* rows, linalg::coo_degree<32>(rows, nnz, row_counts.data(), stream); // create csr compressed row index from row counts - thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); - thrust::device_ptr c_ind_d = thrust::device_pointer_cast(row_ind); - exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, c_ind_d); + thrust::device_ptr row_counts_d = + thrust::device_pointer_cast(row_counts.data()); + thrust::device_ptr c_ind_d = thrust::device_pointer_cast(row_ind); + exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, + c_ind_d); } /** @@ -202,12 +177,11 @@ void sorted_coo_to_csr(const T* rows, * @param stream: cuda stream to use */ template -void sorted_coo_to_csr(COO* coo, - int* row_ind, +void sorted_coo_to_csr(COO *coo, int *row_ind, std::shared_ptr d_alloc, - cudaStream_t stream) -{ - sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc, stream); + cudaStream_t stream) { + sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc, + stream); } }; // end NAMESPACE convert diff --git a/cpp/include/raft/sparse/convert/dense.cuh b/cpp/include/raft/sparse/convert/dense.cuh index e90882b501..299f9d36d4 100644 --- a/cpp/include/raft/sparse/convert/dense.cuh +++ b/cpp/include/raft/sparse/convert/dense.cuh @@ -37,20 +37,22 @@ namespace sparse { namespace convert { template -__global__ void csr_to_dense_warp_per_row_kernel( - int n_cols, const value_t* csrVal, const int* csrRowPtr, const int* csrColInd, value_t* a) -{ +__global__ void csr_to_dense_warp_per_row_kernel(int n_cols, + const value_t *csrVal, + const int *csrRowPtr, + const int *csrColInd, + value_t *a) { int row = blockIdx.x; int tid = threadIdx.x; int colStart = csrRowPtr[row]; - int colEnd = csrRowPtr[row + 1]; - int rowNnz = colEnd - colStart; + int colEnd = csrRowPtr[row + 1]; + int rowNnz = colEnd - colStart; for (int i = tid; i < rowNnz; i += blockDim.x) { int colIdx = colStart + i; if (colIdx < colEnd) { - int col = csrColInd[colIdx]; + int col = csrColInd[colIdx]; a[row * n_cols + col] = csrVal[colIdx]; } } @@ -75,17 +77,10 @@ __global__ void csr_to_dense_warp_per_row_kernel( * @param[in] row_major : Is row-major output desired? */ template -void csr_to_dense(cusparseHandle_t handle, - value_idx nrows, - value_idx ncols, - const value_idx* csr_indptr, - const value_idx* csr_indices, - const value_t* csr_data, - value_idx lda, - value_t* out, - cudaStream_t stream, - bool row_major = true) -{ +void csr_to_dense(cusparseHandle_t handle, value_idx nrows, value_idx ncols, + const value_idx *csr_indptr, const value_idx *csr_indices, + const value_t *csr_data, value_idx lda, value_t *out, + cudaStream_t stream, bool row_major = true) { if (!row_major) { /** * If we need col-major, use cusparse. @@ -96,13 +91,15 @@ void csr_to_dense(cusparseHandle_t handle, CUSPARSE_CHECK(cusparseSetMatType(out_mat, CUSPARSE_MATRIX_TYPE_GENERAL)); CUSPARSE_CHECK(raft::sparse::cusparsecsr2dense( - handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out, lda, stream)); + handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out, + lda, stream)); CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(out_mat)); } else { int blockdim = block_dim(ncols); - CUDA_CHECK(cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream)); + CUDA_CHECK( + cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream)); csr_to_dense_warp_per_row_kernel<<>>( ncols, csr_data, csr_indptr, csr_indices, out); } diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh index 348ed5eab2..73120fea8c 100644 --- a/cpp/include/raft/sparse/coo.cuh +++ b/cpp/include/raft/sparse/coo.cuh @@ -68,87 +68,83 @@ class COO { Index_Type n_cols; /** - * @param d_alloc: the device allocator to use for the underlying buffers - * @param stream: CUDA stream to use - */ + * @param d_alloc: the device allocator to use for the underlying buffers + * @param stream: CUDA stream to use + */ COO(std::shared_ptr d_alloc, cudaStream_t stream) : rows_arr(d_alloc, stream, 0), cols_arr(d_alloc, stream, 0), vals_arr(d_alloc, stream, 0), nnz(0), n_rows(0), - n_cols(0) - { - } + n_cols(0) {} /** - * @param rows: coo rows array - * @param cols: coo cols array - * @param vals: coo vals array - * @param nnz: size of the rows/cols/vals arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of cols in the dense matrix - */ - COO(raft::mr::device::buffer& rows, - raft::mr::device::buffer& cols, - raft::mr::device::buffer& vals, - Index_Type nnz, - Index_Type n_rows = 0, + * @param rows: coo rows array + * @param cols: coo cols array + * @param vals: coo vals array + * @param nnz: size of the rows/cols/vals arrays + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of cols in the dense matrix + */ + COO(raft::mr::device::buffer &rows, + raft::mr::device::buffer &cols, + raft::mr::device::buffer &vals, Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0) - : rows_arr(rows), cols_arr(cols), vals_arr(vals), nnz(nnz), n_rows(n_rows), n_cols(n_cols) - { - } + : rows_arr(rows), + cols_arr(cols), + vals_arr(vals), + nnz(nnz), + n_rows(n_rows), + n_cols(n_cols) {} /** - * @param d_alloc: the device allocator use - * @param stream: CUDA stream to use - * @param nnz: size of the rows/cols/vals arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of cols in the dense matrix - * @param init: initialize arrays with zeros - */ - COO(std::shared_ptr d_alloc, - cudaStream_t stream, - Index_Type nnz, - Index_Type n_rows = 0, - Index_Type n_cols = 0, - bool init = true) + * @param d_alloc: the device allocator use + * @param stream: CUDA stream to use + * @param nnz: size of the rows/cols/vals arrays + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of cols in the dense matrix + * @param init: initialize arrays with zeros + */ + COO(std::shared_ptr d_alloc, cudaStream_t stream, + Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0, + bool init = true) : rows_arr(d_alloc, stream, nnz), cols_arr(d_alloc, stream, nnz), vals_arr(d_alloc, stream, nnz), nnz(nnz), n_rows(n_rows), - n_cols(n_cols) - { + n_cols(n_cols) { if (init) init_arrays(stream); } - void init_arrays(cudaStream_t stream) - { - CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0, this->nnz * sizeof(Index_Type), stream)); - CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0, this->nnz * sizeof(Index_Type), stream)); - CUDA_CHECK(cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream)); + void init_arrays(cudaStream_t stream) { + CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0, + this->nnz * sizeof(Index_Type), stream)); + CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0, + this->nnz * sizeof(Index_Type), stream)); + CUDA_CHECK( + cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream)); } ~COO() {} /** - * @brief Size should be > 0, with the number of rows - * and cols in the dense matrix being > 0. - */ - bool validate_size() const - { + * @brief Size should be > 0, with the number of rows + * and cols in the dense matrix being > 0. + */ + bool validate_size() const { if (this->nnz < 0 || n_rows < 0 || n_cols < 0) return false; return true; } /** - * @brief If the underlying arrays have not been set, - * return false. Otherwise true. - */ - bool validate_mem() const - { - if (this->rows_arr.size() == 0 || this->cols_arr.size() == 0 || this->vals_arr.size() == 0) { + * @brief If the underlying arrays have not been set, + * return false. Otherwise true. + */ + bool validate_mem() const { + if (this->rows_arr.size() == 0 || this->cols_arr.size() == 0 || + this->vals_arr.size() == 0) { return false; } @@ -158,30 +154,33 @@ class COO { /* * @brief Returns the rows array */ - Index_Type* rows() { return this->rows_arr.data(); } + Index_Type *rows() { return this->rows_arr.data(); } /** * @brief Returns the cols array */ - Index_Type* cols() { return this->cols_arr.data(); } + Index_Type *cols() { return this->cols_arr.data(); } /** * @brief Returns the vals array */ - T* vals() { return this->vals_arr.data(); } + T *vals() { return this->vals_arr.data(); } /** - * @brief Send human-readable state information to output stream - */ - friend std::ostream& operator<<(std::ostream& out, const COO& c) - { + * @brief Send human-readable state information to output stream + */ + friend std::ostream &operator<<(std::ostream &out, + const COO &c) { if (c.validate_size() && c.validate_mem()) { cudaStream_t stream; CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream) << std::endl; - out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream) << std::endl; - out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream) << std::endl; + out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream) + << std::endl; + out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream) + << std::endl; + out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream) + << std::endl; out << "nnz=" << c.nnz << std::endl; out << "n_rows=" << c.n_rows << std::endl; out << "n_cols=" << c.n_cols << std::endl; @@ -195,59 +194,58 @@ class COO { } /** - * @brief Set the number of rows and cols - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of columns in the dense matrix - */ - void setSize(int n_rows, int n_cols) - { + * @brief Set the number of rows and cols + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of columns in the dense matrix + */ + void setSize(int n_rows, int n_cols) { this->n_rows = n_rows; this->n_cols = n_cols; } /** - * @brief Set the number of rows and cols for a square dense matrix - * @param n: number of rows and cols - */ - void setSize(int n) - { + * @brief Set the number of rows and cols for a square dense matrix + * @param n: number of rows and cols + */ + void setSize(int n) { this->n_rows = n; this->n_cols = n; } /** - * @brief Allocate the underlying arrays - * @param nnz: size of underlying row/col/val arrays - * @param init: should values be initialized to 0? - * @param stream: CUDA stream to use - */ - void allocate(int nnz, bool init, cudaStream_t stream) { this->allocate(nnz, 0, init, stream); } + * @brief Allocate the underlying arrays + * @param nnz: size of underlying row/col/val arrays + * @param init: should values be initialized to 0? + * @param stream: CUDA stream to use + */ + void allocate(int nnz, bool init, cudaStream_t stream) { + this->allocate(nnz, 0, init, stream); + } /** - * @brief Allocate the underlying arrays - * @param nnz: size of the underlying row/col/val arrays - * @param size: the number of rows/cols in a square dense matrix - * @param init: should values be initialized to 0? - * @param stream: CUDA stream to use - */ - void allocate(int nnz, int size, bool init, cudaStream_t stream) - { + * @brief Allocate the underlying arrays + * @param nnz: size of the underlying row/col/val arrays + * @param size: the number of rows/cols in a square dense matrix + * @param init: should values be initialized to 0? + * @param stream: CUDA stream to use + */ + void allocate(int nnz, int size, bool init, cudaStream_t stream) { this->allocate(nnz, size, size, init, stream); } /** - * @brief Allocate the underlying arrays - * @param nnz: size of the underlying row/col/val arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of columns in the dense matrix - * @param init: should values be initialized to 0? - * @param stream: stream to use for init - */ - void allocate(int nnz, int n_rows, int n_cols, bool init, cudaStream_t stream) - { + * @brief Allocate the underlying arrays + * @param nnz: size of the underlying row/col/val arrays + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of columns in the dense matrix + * @param init: should values be initialized to 0? + * @param stream: stream to use for init + */ + void allocate(int nnz, int n_rows, int n_cols, bool init, + cudaStream_t stream) { this->n_rows = n_rows; this->n_cols = n_cols; - this->nnz = nnz; + this->nnz = nnz; this->rows_arr.resize(this->nnz, stream); this->cols_arr.resize(this->nnz, stream); diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh index 17f3c735af..bc4a68d296 100644 --- a/cpp/include/raft/sparse/csr.cuh +++ b/cpp/include/raft/sparse/csr.cuh @@ -41,64 +41,57 @@ namespace sparse { struct WeakCCState { public: - bool* m; - WeakCCState(bool* m) : m(m) {} + bool *m; + WeakCCState(bool *m) : m(m) {} }; template -__global__ void weak_cc_label_device(Index_* __restrict__ labels, - const Index_* __restrict__ row_ind, - const Index_* __restrict__ row_ind_ptr, - Index_ nnz, - bool* __restrict__ m, - Index_ start_vertex_id, - Index_ batch_size, - Index_ N, - Lambda filter_op) -{ - Index_ tid = threadIdx.x + blockIdx.x * TPB_X; +__global__ void weak_cc_label_device(Index_ *__restrict__ labels, + const Index_ *__restrict__ row_ind, + const Index_ *__restrict__ row_ind_ptr, + Index_ nnz, bool *__restrict__ m, + Index_ start_vertex_id, Index_ batch_size, + Index_ N, Lambda filter_op) { + Index_ tid = threadIdx.x + blockIdx.x * TPB_X; Index_ global_id = tid + start_vertex_id; if (tid < batch_size && global_id < N) { Index_ start = __ldg(row_ind + tid); Index_ ci, cj; - bool ci_mod = false; - ci = labels[global_id]; + bool ci_mod = false; + ci = labels[global_id]; bool ci_allow_prop = filter_op(global_id); Index_ end = get_stop_idx(tid, batch_size, nnz, row_ind); /// TODO: add one element to row_ind and avoid get_stop_idx for (Index_ j = start; j < end; j++) { - Index_ j_ind = __ldg(row_ind_ptr + j); - cj = labels[j_ind]; + Index_ j_ind = __ldg(row_ind_ptr + j); + cj = labels[j_ind]; bool cj_allow_prop = filter_op(j_ind); if (ci < cj && ci_allow_prop) { if (sizeof(Index_) == 4) - atomicMin((int*)(labels + j_ind), ci); + atomicMin((int *)(labels + j_ind), ci); else if (sizeof(Index_) == 8) - atomicMin((long long int*)(labels + j_ind), ci); + atomicMin((long long int *)(labels + j_ind), ci); if (cj_allow_prop) *m = true; } else if (ci > cj && cj_allow_prop) { - ci = cj; + ci = cj; ci_mod = true; } } if (ci_mod) { if (sizeof(Index_) == 4) - atomicMin((int*)(labels + global_id), ci); + atomicMin((int *)(labels + global_id), ci); else if (sizeof(Index_) == 8) - atomicMin((long long int*)(labels + global_id), ci); + atomicMin((long long int *)(labels + global_id), ci); if (ci_allow_prop) *m = true; } } } template -__global__ void weak_cc_init_all_kernel(Index_* labels, - Index_ N, - Index_ MAX_LABEL, - Lambda filter_op) -{ +__global__ void weak_cc_init_all_kernel(Index_ *labels, Index_ N, + Index_ MAX_LABEL, Lambda filter_op) { Index_ tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { if (filter_op(tid)) @@ -130,25 +123,22 @@ __global__ void weak_cc_init_all_kernel(Index_* labels, * @param filter_op an optional filtering function to determine which points * should get considered for labeling. It gets global indexes (not batch-wide!) */ -template bool> -void weak_cc_batched(Index_* labels, - const Index_* row_ind, - const Index_* row_ind_ptr, - Index_ nnz, - Index_ N, - Index_ start_vertex_id, - Index_ batch_size, - WeakCCState* state, - cudaStream_t stream, - Lambda filter_op) -{ - ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8, "Index_ should be 4 or 8 bytes"); +template bool> +void weak_cc_batched(Index_ *labels, const Index_ *row_ind, + const Index_ *row_ind_ptr, Index_ nnz, Index_ N, + Index_ start_vertex_id, Index_ batch_size, + WeakCCState *state, cudaStream_t stream, + Lambda filter_op) { + ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8, + "Index_ should be 4 or 8 bytes"); bool host_m; Index_ MAX_LABEL = std::numeric_limits::max(); weak_cc_init_all_kernel - <<>>(labels, N, MAX_LABEL, filter_op); + <<>>( + labels, N, MAX_LABEL, filter_op); CUDA_CHECK(cudaPeekAtLastError()); int n_iters = 0; @@ -157,7 +147,8 @@ void weak_cc_batched(Index_* labels, weak_cc_label_device <<>>( - labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id, batch_size, N, filter_op); + labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id, + batch_size, N, filter_op); CUDA_CHECK(cudaPeekAtLastError()); //** Updating m * @@ -189,25 +180,12 @@ void weak_cc_batched(Index_* labels, * @param stream the cuda stream to use */ template -void weak_cc_batched(Index_* labels, - const Index_* row_ind, - const Index_* row_ind_ptr, - Index_ nnz, - Index_ N, - Index_ start_vertex_id, - Index_ batch_size, - WeakCCState* state, - cudaStream_t stream) -{ - weak_cc_batched(labels, - row_ind, - row_ind_ptr, - nnz, - N, - start_vertex_id, - batch_size, - state, - stream, +void weak_cc_batched(Index_ *labels, const Index_ *row_ind, + const Index_ *row_ind_ptr, Index_ nnz, Index_ N, + Index_ start_vertex_id, Index_ batch_size, + WeakCCState *state, cudaStream_t stream) { + weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, start_vertex_id, + batch_size, state, stream, [] __device__(Index_ tid) { return true; }); } @@ -235,20 +213,17 @@ void weak_cc_batched(Index_* labels, * @param filter_op an optional filtering function to determine which points * should get considered for labeling. It gets global indexes (not batch-wide!) */ -template bool> -void weak_cc(Index_* labels, - const Index_* row_ind, - const Index_* row_ind_ptr, - Index_ nnz, - Index_ N, +template bool> +void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, + Index_ nnz, Index_ N, std::shared_ptr d_alloc, - cudaStream_t stream, - Lambda filter_op) -{ + cudaStream_t stream, Lambda filter_op) { raft::mr::device::buffer m(d_alloc, stream, 1); WeakCCState state(m.data()); - weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, filter_op); + weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, + stream, filter_op); } /** @@ -274,18 +249,14 @@ void weak_cc(Index_* labels, * @param stream the cuda stream to use */ template -void weak_cc(Index_* labels, - const Index_* row_ind, - const Index_* row_ind_ptr, - Index_ nnz, - Index_ N, +void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, + Index_ nnz, Index_ N, std::shared_ptr d_alloc, - cudaStream_t stream) -{ + cudaStream_t stream) { raft::mr::device::buffer m(d_alloc, stream, 1); WeakCCState state(m.data()); - weak_cc_batched( - labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, [](Index_) { return true; }); + weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, + stream, [](Index_) { return true; }); } }; // namespace sparse diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index 9d42ec34cb..360832f557 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -23,9 +23,10 @@ //#include #define _CUSPARSE_ERR_TO_STR(err) \ - case err: return #err; + case err: \ + return #err; -// Notes: +//Notes: //(1.) CUDA_VER_10_1_UP aggregates all the CUDA version selection logic; //(2.) to enforce a lower version, // @@ -42,15 +43,16 @@ namespace raft { * @brief Exception thrown when a cuSparse error is encountered. */ struct cusparse_error : public raft::exception { - explicit cusparse_error(char const* const message) : raft::exception(message) {} - explicit cusparse_error(std::string const& message) : raft::exception(message) {} + explicit cusparse_error(char const* const message) + : raft::exception(message) {} + explicit cusparse_error(std::string const& message) + : raft::exception(message) {} }; namespace sparse { namespace detail { -inline const char* cusparse_error_to_string(cusparseStatus_t err) -{ +inline const char* cusparse_error_to_string(cusparseStatus_t err) { #if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 return cusparseGetErrorString(err); #else // CUDART_VERSION @@ -63,7 +65,8 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED); _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR); _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - default: return "CUSPARSE_STATUS_UNKNOWN"; + default: + return "CUSPARSE_STATUS_UNKNOWN"; }; #endif // CUDART_VERSION } @@ -85,11 +88,8 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) cusparseStatus_t const status = (call); \ if (CUSPARSE_STATUS_SUCCESS != status) { \ std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "cuSparse error encountered at: ", \ - "call='%s', Reason=%d:%s", \ - #call, \ - status, \ + SET_ERROR_MSG(msg, "cuSparse error encountered at: ", \ + "call='%s', Reason=%d:%s", #call, status, \ raft::sparse::detail::cusparse_error_to_string(status)); \ throw raft::cusparse_error(msg); \ } \ @@ -100,15 +100,13 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) //@todo: use logger here once logging is enabled /** check for cusparse runtime API errors but do not assert */ -#define CUSPARSE_CHECK_NO_THROW(call) \ - do { \ - cusparseStatus_t err = call; \ - if (err != CUSPARSE_STATUS_SUCCESS) { \ - printf("CUSPARSE call='%s' got errorcode=%d err=%s", \ - #call, \ - err, \ - raft::sparse::detail::cusparse_error_to_string(err)); \ - } \ +#define CUSPARSE_CHECK_NO_THROW(call) \ + do { \ + cusparseStatus_t err = call; \ + if (err != CUSPARSE_STATUS_SUCCESS) { \ + printf("CUSPARSE call='%s' got errorcode=%d err=%s", #call, err, \ + raft::sparse::detail::cusparse_error_to_string(err)); \ + } \ } while (0) namespace raft { @@ -119,34 +117,28 @@ namespace sparse { * @{ */ template -cusparseStatus_t cusparsegthr( - cusparseHandle_t handle, int nnz, const T* vals, T* vals_sorted, int* d_P, cudaStream_t stream); +cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, const T* vals, + T* vals_sorted, int* d_P, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, - int nnz, - const double* vals, - double* vals_sorted, - int* d_P, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, + const double* vals, double* vals_sorted, + int* d_P, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDgthr(handle, nnz, vals, vals_sorted, d_P, CUSPARSE_INDEX_BASE_ZERO); + return cusparseDgthr(handle, nnz, vals, vals_sorted, d_P, + CUSPARSE_INDEX_BASE_ZERO); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, - int nnz, - const float* vals, - float* vals_sorted, - int* d_P, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, + const float* vals, float* vals_sorted, + int* d_P, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseSgthr(handle, nnz, vals, vals_sorted, d_P, CUSPARSE_INDEX_BASE_ZERO); + return cusparseSgthr(handle, nnz, vals, vals_sorted, d_P, + CUSPARSE_INDEX_BASE_ZERO); #pragma GCC diagnostic pop } /** @} */ @@ -156,18 +148,15 @@ inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, * @{ */ template -void cusparsecoo2csr( - cusparseHandle_t handle, const T* cooRowInd, int nnz, int m, T* csrRowPtr, cudaStream_t stream); +void cusparsecoo2csr(cusparseHandle_t handle, const T* cooRowInd, int nnz, + int m, T* csrRowPtr, cudaStream_t stream); template <> -inline void cusparsecoo2csr(cusparseHandle_t handle, - const int* cooRowInd, - int nnz, - int m, - int* csrRowPtr, - cudaStream_t stream) -{ +inline void cusparsecoo2csr(cusparseHandle_t handle, const int* cooRowInd, + int nnz, int m, int* csrRowPtr, + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK(cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr, CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CHECK(cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr, + CUSPARSE_INDEX_BASE_ZERO)); } /** @} */ @@ -177,54 +166,30 @@ inline void cusparsecoo2csr(cusparseHandle_t handle, */ template size_t cusparsecoosort_bufferSizeExt( // NOLINT - cusparseHandle_t handle, - int m, - int n, - int nnz, - const T* cooRows, - const T* cooCols, - cudaStream_t stream); + cusparseHandle_t handle, int m, int n, int nnz, const T* cooRows, + const T* cooCols, cudaStream_t stream); template <> inline size_t cusparsecoosort_bufferSizeExt( // NOLINT - cusparseHandle_t handle, - int m, - int n, - int nnz, - const int* cooRows, - const int* cooCols, - cudaStream_t stream) -{ + cusparseHandle_t handle, int m, int n, int nnz, const int* cooRows, + const int* cooCols, cudaStream_t stream) { size_t val; CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK(cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols, &val)); + CUSPARSE_CHECK( + cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols, &val)); return val; } template void cusparsecoosortByRow( // NOLINT - cusparseHandle_t handle, - int m, - int n, - int nnz, - T* cooRows, - T* cooCols, - T* P, - void* pBuffer, - cudaStream_t stream); + cusparseHandle_t handle, int m, int n, int nnz, T* cooRows, T* cooCols, T* P, + void* pBuffer, cudaStream_t stream); template <> inline void cusparsecoosortByRow( // NOLINT - cusparseHandle_t handle, - int m, - int n, - int nnz, - int* cooRows, - int* cooCols, - int* P, - void* pBuffer, - cudaStream_t stream) -{ + cusparseHandle_t handle, int m, int n, int nnz, int* cooRows, int* cooCols, + int* P, void* pBuffer, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK(cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer)); + CUSPARSE_CHECK( + cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer)); } /** @} */ @@ -234,67 +199,37 @@ inline void cusparsecoosortByRow( // NOLINT */ template cusparseStatus_t cusparsegemmi( // NOLINT - cusparseHandle_t handle, - int m, - int n, - int k, - int nnz, - const T* alpha, - const T* A, - int lda, - const T* cscValB, - const int* cscColPtrB, - const int* cscRowIndB, - const T* beta, - T* C, - int ldc, - cudaStream_t stream); + cusparseHandle_t handle, int m, int n, int k, int nnz, const T* alpha, + const T* A, int lda, const T* cscValB, const int* cscColPtrB, + const int* cscRowIndB, const T* beta, T* C, int ldc, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, - int m, - int n, - int k, - int nnz, - const float* alpha, - const float* A, - int lda, +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, + int k, int nnz, const float* alpha, + const float* A, int lda, const float* cscValB, const int* cscColPtrB, - const int* cscRowIndB, - const float* beta, - float* C, - int ldc, - cudaStream_t stream) -{ + const int* cscRowIndB, const float* beta, + float* C, int ldc, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseSgemmi( - handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); + return cusparseSgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, + cscColPtrB, cscRowIndB, beta, C, ldc); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, - int m, - int n, - int k, - int nnz, - const double* alpha, - const double* A, - int lda, +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, + int k, int nnz, const double* alpha, + const double* A, int lda, const double* cscValB, const int* cscColPtrB, - const int* cscRowIndB, - const double* beta, - double* C, - int ldc, - cudaStream_t stream) -{ + const int* cscRowIndB, const double* beta, + double* C, int ldc, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDgemmi( - handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); + return cusparseDgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, + cscColPtrB, cscRowIndB, beta, C, ldc); #pragma GCC diagnostic pop } /** @} */ @@ -306,94 +241,49 @@ inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, */ template cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, - int64_t cols, - int64_t nnz, - IndexT* csrRowOffsets, - IndexT* csrColInd, + int64_t rows, int64_t cols, int64_t nnz, + IndexT* csrRowOffsets, IndexT* csrColInd, ValueT* csrValues); template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, - int64_t cols, - int64_t nnz, - int* csrRowOffsets, - int* csrColInd, - float* csrValues) -{ - return cusparseCreateCsr(spMatDescr, - rows, - cols, - nnz, - csrRowOffsets, - csrColInd, - csrValues, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, + int64_t rows, int64_t cols, + int64_t nnz, int* csrRowOffsets, + int* csrColInd, float* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); } template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, - int64_t cols, - int64_t nnz, - int* csrRowOffsets, - int* csrColInd, - double* csrValues) -{ - return cusparseCreateCsr(spMatDescr, - rows, - cols, - nnz, - csrRowOffsets, - csrColInd, - csrValues, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, + int64_t rows, int64_t cols, + int64_t nnz, int* csrRowOffsets, + int* csrColInd, double* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F); } template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, - int64_t cols, - int64_t nnz, - int64_t* csrRowOffsets, + int64_t rows, int64_t cols, + int64_t nnz, int64_t* csrRowOffsets, int64_t* csrColInd, - float* csrValues) -{ - return cusparseCreateCsr(spMatDescr, - rows, - cols, - nnz, - csrRowOffsets, - csrColInd, - csrValues, - CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_BASE_ZERO, + float* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); } template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, - int64_t cols, - int64_t nnz, - int64_t* csrRowOffsets, + int64_t rows, int64_t cols, + int64_t nnz, int64_t* csrRowOffsets, int64_t* csrColInd, - double* csrValues) -{ - return cusparseCreateCsr(spMatDescr, - rows, - cols, - nnz, - csrRowOffsets, - csrColInd, - csrValues, - CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_BASE_ZERO, + double* csrValues) { + return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, + csrColInd, csrValues, CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F); } /** @} */ @@ -402,19 +292,16 @@ inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, * @{ */ template -cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, int64_t size, T* values); +cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, + int64_t size, T* values); template <> inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, - int64_t size, - float* values) -{ + int64_t size, float* values) { return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_32F); } template <> inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, - int64_t size, - double* values) -{ + int64_t size, double* values) { return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_64F); } /** @} */ @@ -425,30 +312,23 @@ inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, */ template cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, - int64_t rows, - int64_t cols, - int64_t ld, - T* values, - cusparseOrder_t order); + int64_t rows, int64_t cols, int64_t ld, + T* values, cusparseOrder_t order); template <> inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, - int64_t rows, - int64_t cols, - int64_t ld, - float* values, - cusparseOrder_t order) -{ - return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_32F, order); + int64_t rows, int64_t cols, + int64_t ld, float* values, + cusparseOrder_t order) { + return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_32F, + order); } template <> inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, - int64_t rows, - int64_t cols, - int64_t ld, - double* values, - cusparseOrder_t order) -{ - return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_64F, order); + int64_t rows, int64_t cols, + int64_t ld, double* values, + cusparseOrder_t order) { + return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_64F, + order); } /** @} */ @@ -457,89 +337,58 @@ inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, * @{ */ template -cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle, - cusparseOperation_t opA, - const T* alpha, - const cusparseSpMatDescr_t matA, - const cusparseDnVecDescr_t vecX, - const T* beta, - const cusparseDnVecDescr_t vecY, - cusparseSpMVAlg_t alg, - size_t* bufferSize, - cudaStream_t stream); +cusparseStatus_t cusparsespmv_buffersize( + cusparseHandle_t handle, cusparseOperation_t opA, const T* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const T* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + size_t* bufferSize, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle, - cusparseOperation_t opA, - const float* alpha, - const cusparseSpMatDescr_t matA, - const cusparseDnVecDescr_t vecX, - const float* beta, - const cusparseDnVecDescr_t vecY, - cusparseSpMVAlg_t alg, - size_t* bufferSize, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsespmv_buffersize( + cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + size_t* bufferSize, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV_bufferSize( - handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, alg, bufferSize); + return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, + CUDA_R_32F, alg, bufferSize); } template <> -inline cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle, - cusparseOperation_t opA, - const double* alpha, - const cusparseSpMatDescr_t matA, - const cusparseDnVecDescr_t vecX, - const double* beta, - const cusparseDnVecDescr_t vecY, - cusparseSpMVAlg_t alg, - size_t* bufferSize, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsespmv_buffersize( + cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + size_t* bufferSize, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV_bufferSize( - handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, bufferSize); + return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, + CUDA_R_64F, alg, bufferSize); } template -cusparseStatus_t cusparsespmv(cusparseHandle_t handle, - cusparseOperation_t opA, - const T* alpha, - const cusparseSpMatDescr_t matA, - const cusparseDnVecDescr_t vecX, - const T* beta, +cusparseStatus_t cusparsespmv(cusparseHandle_t handle, cusparseOperation_t opA, + const T* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, const T* beta, const cusparseDnVecDescr_t vecY, - cusparseSpMVAlg_t alg, - T* externalBuffer, + cusparseSpMVAlg_t alg, T* externalBuffer, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle, - cusparseOperation_t opA, - const float* alpha, - const cusparseSpMatDescr_t matA, - const cusparseDnVecDescr_t vecX, - const float* beta, - const cusparseDnVecDescr_t vecY, - cusparseSpMVAlg_t alg, - float* externalBuffer, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsespmv( + cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + float* externalBuffer, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, alg, externalBuffer); + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, + alg, externalBuffer); } template <> -inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle, - cusparseOperation_t opA, - const double* alpha, - const cusparseSpMatDescr_t matA, - const cusparseDnVecDescr_t vecX, - const double* beta, - const cusparseDnVecDescr_t vecY, - cusparseSpMVAlg_t alg, - double* externalBuffer, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsespmv( + cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha, + const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, + const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + double* externalBuffer, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, externalBuffer); + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, + alg, externalBuffer); } /** @} */ #else @@ -549,59 +398,29 @@ inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle, */ template cusparseStatus_t cusparsecsrmv( // NOLINT - cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int nnz, - const T* alpha, - const cusparseMatDescr_t descr, - const T* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const T* x, - const T* beta, - T* y, + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, + const T* alpha, const cusparseMatDescr_t descr, const T* csrVal, + const int* csrRowPtr, const int* csrColInd, const T* x, const T* beta, T* y, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int nnz, - const float* alpha, - const cusparseMatDescr_t descr, - const float* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const float* x, - const float* beta, - float* y, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsrmv( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, + const float* alpha, const cusparseMatDescr_t descr, const float* csrVal, + const int* csrRowPtr, const int* csrColInd, const float* x, const float* beta, + float* y, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsrmv( - handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); + return cusparseScsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, beta, y); } template <> -inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int nnz, - const double* alpha, - const cusparseMatDescr_t descr, - const double* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const double* x, - const double* beta, - double* y, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsrmv( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, + const double* alpha, const cusparseMatDescr_t descr, const double* csrVal, + const int* csrRowPtr, const int* csrColInd, const double* x, + const double* beta, double* y, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsrmv( - handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); + return cusparseDcsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, beta, y); } /** @} */ #endif @@ -612,96 +431,58 @@ inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, * @{ */ template -cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle, - cusparseOperation_t opA, - cusparseOperation_t opB, - const T* alpha, - const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, - const T* beta, - cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, - size_t* bufferSize, - cudaStream_t stream); +cusparseStatus_t cusparsespmm_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const T* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle, - cusparseOperation_t opA, - cusparseOperation_t opB, - const float* alpha, - const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, - const float* beta, - cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, - size_t* bufferSize, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsespmm_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const float* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM_bufferSize( - handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_32F, alg, bufferSize); + return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, + matC, CUDA_R_32F, alg, bufferSize); } template <> -inline cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle, - cusparseOperation_t opA, - cusparseOperation_t opB, - const double* alpha, - const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, - const double* beta, - cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, - size_t* bufferSize, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsespmm_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const double* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const double* beta, + cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, size_t* bufferSize, + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM_bufferSize( - handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_64F, alg, bufferSize); + return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, + matC, CUDA_R_64F, alg, bufferSize); } template -inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle, - cusparseOperation_t opA, - cusparseOperation_t opB, - const T* alpha, - const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, - const T* beta, - cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, - T* externalBuffer, - cudaStream_t stream); +inline cusparseStatus_t cusparsespmm( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const T* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, T* externalBuffer, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle, - cusparseOperation_t opA, - cusparseOperation_t opB, - const float* alpha, - const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, - const float* beta, - cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, - float* externalBuffer, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsespmm( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const float* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, float* externalBuffer, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM( - handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_32F, alg, externalBuffer); + return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, + CUDA_R_32F, alg, externalBuffer); } template <> -inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle, - cusparseOperation_t opA, - cusparseOperation_t opB, - const double* alpha, - const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, - const double* beta, - cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, - double* externalBuffer, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsespmm( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const double* alpha, const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, const double* beta, + cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, double* externalBuffer, + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM( - handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_64F, alg, externalBuffer); + return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, + CUDA_R_64F, alg, externalBuffer); } /** @} */ #else @@ -711,68 +492,31 @@ inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle, */ template cusparseStatus_t cusparsecsrmm( // NOLINT - cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int k, - int nnz, - const T* alpha, - const cusparseMatDescr_t descr, - const T* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const T* x, - const int ldx, - const T* beta, - T* y, - const int ldy, - cudaStream_t stream); + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, + int nnz, const T* alpha, const cusparseMatDescr_t descr, const T* csrVal, + const int* csrRowPtr, const int* csrColInd, const T* x, const int ldx, + const T* beta, T* y, const int ldy, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int k, - int nnz, - const float* alpha, - const cusparseMatDescr_t descr, - const float* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const float* x, - const int ldx, - const float* beta, - float* y, - const int ldy, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsrmm( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, + int nnz, const float* alpha, const cusparseMatDescr_t descr, + const float* csrVal, const int* csrRowPtr, const int* csrColInd, + const float* x, const int ldx, const float* beta, float* y, const int ldy, + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsrmm( - handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); + return cusparseScsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, ldx, beta, y, ldy); } template <> -inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - int n, - int k, - int nnz, - const double* alpha, - const cusparseMatDescr_t descr, - const double* csrVal, - const int* csrRowPtr, - const int* csrColInd, - const double* x, - const int ldx, - const double* beta, - double* y, - const int ldy, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsrmm( + cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, + int nnz, const double* alpha, const cusparseMatDescr_t descr, + const double* csrVal, const int* csrRowPtr, const int* csrColInd, + const double* x, const int ldx, const double* beta, double* y, const int ldy, + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsrmm( - handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); + return cusparseDcsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, + csrRowPtr, csrColInd, x, ldx, beta, y, ldy); } /** @} */ #endif @@ -783,22 +527,15 @@ inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, */ template void cusparsecsr2coo( // NOLINT - cusparseHandle_t handle, - const int n, - const int nnz, - const T* csrRowPtr, - T* cooRowInd, - cudaStream_t stream); + cusparseHandle_t handle, const int n, const int nnz, const T* csrRowPtr, + T* cooRowInd, cudaStream_t stream); template <> -inline void cusparsecsr2coo(cusparseHandle_t handle, - const int n, - const int nnz, - const int* csrRowPtr, - int* cooRowInd, - cudaStream_t stream) -{ +inline void cusparsecsr2coo(cusparseHandle_t handle, const int n, const int nnz, + const int* csrRowPtr, int* cooRowInd, + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, + CUSPARSE_INDEX_BASE_ZERO)); } /** @} */ @@ -816,8 +553,7 @@ inline void cusparsecsr2coo(cusparseHandle_t handle, // template<> inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle, cusparsePointerMode_t mode, - cudaStream_t stream) -{ + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSetPointerMode(handle, mode); } @@ -828,203 +564,69 @@ inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle, * @{ */ template -cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle, - cusparseAlgMode_t alg, - cusparseOperation_t transA, - int m, - int n, - int nnz, - const T* alpha, - const cusparseMatDescr_t descrA, - const T* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - const T* x, - const T* beta, - T* y, - size_t* bufferSizeInBytes, - cudaStream_t stream); -template <> -inline cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle, - cusparseAlgMode_t alg, - cusparseOperation_t transA, - int m, - int n, - int nnz, - const float* alpha, - const cusparseMatDescr_t descrA, - const float* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - const float* x, - const float* beta, - float* y, - size_t* bufferSizeInBytes, - cudaStream_t stream) -{ - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx_bufferSize(handle, - alg, - transA, - m, - n, - nnz, - alpha, - CUDA_R_32F, - descrA, - csrValA, - CUDA_R_32F, - csrRowPtrA, - csrColIndA, - x, - CUDA_R_32F, - beta, - CUDA_R_32F, - y, - CUDA_R_32F, - CUDA_R_32F, - bufferSizeInBytes); -} -template <> -inline cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle, - cusparseAlgMode_t alg, - cusparseOperation_t transA, - int m, - int n, - int nnz, - const double* alpha, - const cusparseMatDescr_t descrA, - const double* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - const double* x, - const double* beta, - double* y, - size_t* bufferSizeInBytes, - cudaStream_t stream) -{ - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx_bufferSize(handle, - alg, - transA, - m, - n, - nnz, - alpha, - CUDA_R_64F, - descrA, - csrValA, - CUDA_R_64F, - csrRowPtrA, - csrColIndA, - x, - CUDA_R_64F, - beta, - CUDA_R_64F, - y, - CUDA_R_64F, - CUDA_R_64F, - bufferSizeInBytes); +cusparseStatus_t cusparsecsrmvex_bufferSize( + cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, + int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA, + const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x, + const T* beta, T* y, size_t* bufferSizeInBytes, cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmvex_bufferSize( + cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, + int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA, + const float* csrValA, const int* csrRowPtrA, const int* csrColIndA, + const float* x, const float* beta, float* y, size_t* bufferSizeInBytes, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx_bufferSize( + handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F, descrA, csrValA, + CUDA_R_32F, csrRowPtrA, csrColIndA, x, CUDA_R_32F, beta, CUDA_R_32F, y, + CUDA_R_32F, CUDA_R_32F, bufferSizeInBytes); +} +template <> +inline cusparseStatus_t cusparsecsrmvex_bufferSize( + cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, + int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA, + const double* csrValA, const int* csrRowPtrA, const int* csrColIndA, + const double* x, const double* beta, double* y, size_t* bufferSizeInBytes, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx_bufferSize( + handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F, descrA, csrValA, + CUDA_R_64F, csrRowPtrA, csrColIndA, x, CUDA_R_64F, beta, CUDA_R_64F, y, + CUDA_R_64F, CUDA_R_64F, bufferSizeInBytes); } template -cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle, - cusparseAlgMode_t alg, - cusparseOperation_t transA, - int m, - int n, - int nnz, - const T* alpha, - const cusparseMatDescr_t descrA, - const T* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - const T* x, - const T* beta, - T* y, - T* buffer, - cudaStream_t stream); -template <> -inline cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle, - cusparseAlgMode_t alg, - cusparseOperation_t transA, - int m, - int n, - int nnz, - const float* alpha, - const cusparseMatDescr_t descrA, - const float* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - const float* x, - const float* beta, - float* y, - float* buffer, - cudaStream_t stream) -{ - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx(handle, - alg, - transA, - m, - n, - nnz, - alpha, - CUDA_R_32F, - descrA, - csrValA, - CUDA_R_32F, - csrRowPtrA, - csrColIndA, - x, - CUDA_R_32F, - beta, - CUDA_R_32F, - y, - CUDA_R_32F, - CUDA_R_32F, - buffer); -} -template <> -inline cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle, - cusparseAlgMode_t alg, - cusparseOperation_t transA, - int m, - int n, - int nnz, - const double* alpha, - const cusparseMatDescr_t descrA, - const double* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - const double* x, - const double* beta, - double* y, - double* buffer, - cudaStream_t stream) -{ - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx(handle, - alg, - transA, - m, - n, - nnz, - alpha, - CUDA_R_64F, - descrA, - csrValA, - CUDA_R_64F, - csrRowPtrA, - csrColIndA, - x, - CUDA_R_64F, - beta, - CUDA_R_64F, - y, - CUDA_R_64F, - CUDA_R_64F, - buffer); +cusparseStatus_t cusparsecsrmvex( + cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, + int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA, + const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x, + const T* beta, T* y, T* buffer, cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmvex( + cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, + int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA, + const float* csrValA, const int* csrRowPtrA, const int* csrColIndA, + const float* x, const float* beta, float* y, float* buffer, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F, + descrA, csrValA, CUDA_R_32F, csrRowPtrA, csrColIndA, x, + CUDA_R_32F, beta, CUDA_R_32F, y, CUDA_R_32F, + CUDA_R_32F, buffer); +} +template <> +inline cusparseStatus_t cusparsecsrmvex( + cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, + int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA, + const double* csrValA, const int* csrRowPtrA, const int* csrColIndA, + const double* x, const double* beta, double* y, double* buffer, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F, + descrA, csrValA, CUDA_R_64F, csrRowPtrA, csrColIndA, x, + CUDA_R_64F, beta, CUDA_R_64F, y, CUDA_R_64F, + CUDA_R_64F, buffer); } /** @} */ @@ -1035,180 +637,68 @@ inline cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle, */ template -cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle, - int m, - int n, - int nnz, - const T* csrVal, - const int* csrRowPtr, - const int* csrColInd, - void* cscVal, - int* cscColPtr, - int* cscRowInd, - cusparseAction_t copyValues, - cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, - size_t* bufferSize, - cudaStream_t stream); +cusparseStatus_t cusparsecsr2csc_bufferSize( + cusparseHandle_t handle, int m, int n, int nnz, const T* csrVal, + const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, + int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle, - int m, - int n, - int nnz, - const float* csrVal, - const int* csrRowPtr, - const int* csrColInd, - void* cscVal, - int* cscColPtr, - int* cscRowInd, - cusparseAction_t copyValues, - cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, - size_t* bufferSize, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsr2csc_bufferSize( + cusparseHandle_t handle, int m, int n, int nnz, const float* csrVal, + const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, + int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2_bufferSize(handle, - m, - n, - nnz, - csrVal, - csrRowPtr, - csrColInd, - cscVal, - cscColPtr, - cscRowInd, - CUDA_R_32F, - copyValues, - idxBase, - alg, - bufferSize); + return cusparseCsr2cscEx2_bufferSize( + handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, + cscRowInd, CUDA_R_32F, copyValues, idxBase, alg, bufferSize); } template <> -inline cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle, - int m, - int n, - int nnz, - const double* csrVal, - const int* csrRowPtr, - const int* csrColInd, - void* cscVal, - int* cscColPtr, - int* cscRowInd, - cusparseAction_t copyValues, - cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, - size_t* bufferSize, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsr2csc_bufferSize( + cusparseHandle_t handle, int m, int n, int nnz, const double* csrVal, + const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, + int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2_bufferSize(handle, - m, - n, - nnz, - csrVal, - csrRowPtr, - csrColInd, - cscVal, - cscColPtr, - cscRowInd, - CUDA_R_64F, - copyValues, - idxBase, - alg, - bufferSize); + return cusparseCsr2cscEx2_bufferSize( + handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, + cscRowInd, CUDA_R_64F, copyValues, idxBase, alg, bufferSize); } template -cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle, - int m, - int n, - int nnz, - const T* csrVal, - const int* csrRowPtr, - const int* csrColInd, - void* cscVal, - int* cscColPtr, - int* cscRowInd, - cusparseAction_t copyValues, - cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, - void* buffer, - cudaStream_t stream); +cusparseStatus_t cusparsecsr2csc( + cusparseHandle_t handle, int m, int n, int nnz, const T* csrVal, + const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, + int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle, - int m, - int n, - int nnz, - const float* csrVal, - const int* csrRowPtr, - const int* csrColInd, - void* cscVal, - int* cscColPtr, - int* cscRowInd, - cusparseAction_t copyValues, - cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, - void* buffer, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsr2csc( + cusparseHandle_t handle, int m, int n, int nnz, const float* csrVal, + const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, + int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2(handle, - m, - n, - nnz, - csrVal, - csrRowPtr, - csrColInd, - cscVal, - cscColPtr, - cscRowInd, - CUDA_R_32F, - copyValues, - idxBase, - alg, - buffer); + return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, + cscVal, cscColPtr, cscRowInd, CUDA_R_32F, + copyValues, idxBase, alg, buffer); } template <> -inline cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle, - int m, - int n, - int nnz, - const double* csrVal, - const int* csrRowPtr, - const int* csrColInd, - void* cscVal, - int* cscColPtr, - int* cscRowInd, - cusparseAction_t copyValues, - cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, - void* buffer, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsr2csc( + cusparseHandle_t handle, int m, int n, int nnz, const double* csrVal, + const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, + int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2(handle, - m, - n, - nnz, - csrVal, - csrRowPtr, - csrColInd, - cscVal, - cscColPtr, - cscRowInd, - CUDA_R_64F, - copyValues, - idxBase, - alg, - buffer); + return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, + cscVal, cscColPtr, cscRowInd, CUDA_R_64F, + copyValues, idxBase, alg, buffer); } /** @} */ @@ -1219,329 +709,120 @@ inline cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle, */ template -cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle, - int m, - int n, - int k, - const T* alpha, - const T* beta, - const cusparseMatDescr_t matA, - int nnzA, - const int* rowindA, - const int* indicesA, - const cusparseMatDescr_t matB, - int nnzB, - const int* rowindB, - const int* indicesB, - const cusparseMatDescr_t matD, - int nnzD, - const int* rowindD, - const int* indicesD, - csrgemm2Info_t info, - size_t* pBufferSizeInBytes, - cudaStream_t stream); - -template <> -inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const cusparseMatDescr_t matA, - int nnzA, - const int* rowindA, - const int* indicesA, - const cusparseMatDescr_t matB, - int nnzB, - const int* rowindB, - const int* indicesB, - const cusparseMatDescr_t matD, - int nnzD, - const int* rowindD, - const int* indicesD, - csrgemm2Info_t info, - size_t* pBufferSizeInBytes, - cudaStream_t stream) -{ +cusparseStatus_t cusparsecsrgemm2_buffersizeext( + cusparseHandle_t handle, int m, int n, int k, const T* alpha, const T* beta, + const cusparseMatDescr_t matA, int nnzA, const int* rowindA, + const int* indicesA, const cusparseMatDescr_t matB, int nnzB, + const int* rowindB, const int* indicesB, const cusparseMatDescr_t matD, + int nnzD, const int* rowindD, const int* indicesD, csrgemm2Info_t info, + size_t* pBufferSizeInBytes, cudaStream_t stream); + +template <> +inline cusparseStatus_t cusparsecsrgemm2_buffersizeext( + cusparseHandle_t handle, int m, int n, int k, const float* alpha, + const float* beta, const cusparseMatDescr_t matA, int nnzA, + const int* rowindA, const int* indicesA, const cusparseMatDescr_t matB, + int nnzB, const int* rowindB, const int* indicesB, + const cusparseMatDescr_t matD, int nnzD, const int* rowindD, + const int* indicesD, csrgemm2Info_t info, size_t* pBufferSizeInBytes, + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseScsrgemm2_bufferSizeExt(handle, - m, - n, - k, - alpha, - matA, - nnzA, - rowindA, - indicesA, - matB, - nnzB, - rowindB, - indicesB, - beta, - matD, - nnzD, - rowindD, - indicesD, - info, - pBufferSizeInBytes); + return cusparseScsrgemm2_bufferSizeExt( + handle, m, n, k, alpha, matA, nnzA, rowindA, indicesA, matB, nnzB, rowindB, + indicesB, beta, matD, nnzD, rowindD, indicesD, info, pBufferSizeInBytes); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle, - int m, - int n, - int k, - const double* alpha, - const double* beta, - const cusparseMatDescr_t matA, - int nnzA, - const int* rowindA, - const int* indicesA, - const cusparseMatDescr_t matB, - int nnzB, - const int* rowindB, - const int* indicesB, - const cusparseMatDescr_t matD, - int nnzD, - const int* rowindD, - const int* indicesD, - csrgemm2Info_t info, - size_t* pBufferSizeInBytes, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsrgemm2_buffersizeext( + cusparseHandle_t handle, int m, int n, int k, const double* alpha, + const double* beta, const cusparseMatDescr_t matA, int nnzA, + const int* rowindA, const int* indicesA, const cusparseMatDescr_t matB, + int nnzB, const int* rowindB, const int* indicesB, + const cusparseMatDescr_t matD, int nnzD, const int* rowindD, + const int* indicesD, csrgemm2Info_t info, size_t* pBufferSizeInBytes, + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDcsrgemm2_bufferSizeExt(handle, - m, - n, - k, - alpha, - matA, - nnzA, - rowindA, - indicesA, - matB, - nnzB, - rowindB, - indicesB, - beta, - matD, - nnzD, - rowindD, - indicesD, - info, - pBufferSizeInBytes); + return cusparseDcsrgemm2_bufferSizeExt( + handle, m, n, k, alpha, matA, nnzA, rowindA, indicesA, matB, nnzB, rowindB, + indicesB, beta, matD, nnzD, rowindD, indicesD, info, pBufferSizeInBytes); #pragma GCC diagnostic pop } -inline cusparseStatus_t cusparsecsrgemm2nnz(cusparseHandle_t handle, - int m, - int n, - int k, - const cusparseMatDescr_t matA, - int nnzA, - const int* rowindA, - const int* indicesA, - const cusparseMatDescr_t matB, - int nnzB, - const int* rowindB, - const int* indicesB, - const cusparseMatDescr_t matD, - int nnzD, - const int* rowindD, - const int* indicesD, - const cusparseMatDescr_t matC, - int* rowindC, - int* nnzC, - const csrgemm2Info_t info, - void* pBuffer, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsrgemm2nnz( + cusparseHandle_t handle, int m, int n, int k, const cusparseMatDescr_t matA, + int nnzA, const int* rowindA, const int* indicesA, + const cusparseMatDescr_t matB, int nnzB, const int* rowindB, + const int* indicesB, const cusparseMatDescr_t matD, int nnzD, + const int* rowindD, const int* indicesD, const cusparseMatDescr_t matC, + int* rowindC, int* nnzC, const csrgemm2Info_t info, void* pBuffer, + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseXcsrgemm2Nnz(handle, - m, - n, - k, - matA, - nnzA, - rowindA, - indicesA, - matB, - nnzB, - rowindB, - indicesB, - matD, - nnzD, - rowindD, - indicesD, - matC, - rowindC, - nnzC, - info, + return cusparseXcsrgemm2Nnz(handle, m, n, k, matA, nnzA, rowindA, indicesA, + matB, nnzB, rowindB, indicesB, matD, nnzD, + rowindD, indicesD, matC, rowindC, nnzC, info, pBuffer); #pragma GCC diagnostic pop } template -cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle, - int m, - int n, - int k, - const T* alpha, - const cusparseMatDescr_t descrA, - int nnzA, - const T* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - const cusparseMatDescr_t descrB, - int nnzB, - const T* csrValB, - const int* csrRowPtrB, - const int* csrColIndB, - const T* beta, - const cusparseMatDescr_t descrD, - int nnzD, - const T* csrValD, - const int* csrRowPtrD, - const int* csrColIndD, - const cusparseMatDescr_t descrC, - T* csrValC, - const int* csrRowPtrC, - int* csrColIndC, - const csrgemm2Info_t info, - void* pBuffer, - cudaStream_t stream); - -template <> -inline cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle, - int m, - int n, - int k, - const float* alpha, - const cusparseMatDescr_t descrA, - int nnzA, - const float* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - const cusparseMatDescr_t descrB, - int nnzB, - const float* csrValB, - const int* csrRowPtrB, - const int* csrColIndB, - const float* beta, - const cusparseMatDescr_t descrD, - int nnzD, - const float* csrValD, - const int* csrRowPtrD, - const int* csrColIndD, - const cusparseMatDescr_t descrC, - float* csrValC, - const int* csrRowPtrC, - int* csrColIndC, - const csrgemm2Info_t info, - void* pBuffer, - cudaStream_t stream) -{ +cusparseStatus_t cusparsecsrgemm2( + cusparseHandle_t handle, int m, int n, int k, const T* alpha, + const cusparseMatDescr_t descrA, int nnzA, const T* csrValA, + const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB, + int nnzB, const T* csrValB, const int* csrRowPtrB, const int* csrColIndB, + const T* beta, const cusparseMatDescr_t descrD, int nnzD, const T* csrValD, + const int* csrRowPtrD, const int* csrColIndD, const cusparseMatDescr_t descrC, + T* csrValC, const int* csrRowPtrC, int* csrColIndC, const csrgemm2Info_t info, + void* pBuffer, cudaStream_t stream); + +template <> +inline cusparseStatus_t cusparsecsrgemm2( + cusparseHandle_t handle, int m, int n, int k, const float* alpha, + const cusparseMatDescr_t descrA, int nnzA, const float* csrValA, + const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB, + int nnzB, const float* csrValB, const int* csrRowPtrB, const int* csrColIndB, + const float* beta, const cusparseMatDescr_t descrD, int nnzD, + const float* csrValD, const int* csrRowPtrD, const int* csrColIndD, + const cusparseMatDescr_t descrC, float* csrValC, const int* csrRowPtrC, + int* csrColIndC, const csrgemm2Info_t info, void* pBuffer, + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseScsrgemm2(handle, - m, - n, - k, - alpha, - descrA, - nnzA, - csrValA, - csrRowPtrA, - csrColIndA, - descrB, - nnzB, - csrValB, - csrRowPtrB, - csrColIndB, - beta, - descrD, - nnzD, - csrValD, - csrRowPtrD, - csrColIndD, - descrC, - csrValC, - csrRowPtrC, - csrColIndC, - info, - pBuffer); + return cusparseScsrgemm2(handle, m, n, k, alpha, descrA, nnzA, csrValA, + csrRowPtrA, csrColIndA, descrB, nnzB, csrValB, + csrRowPtrB, csrColIndB, beta, descrD, nnzD, csrValD, + csrRowPtrD, csrColIndD, descrC, csrValC, csrRowPtrC, + csrColIndC, info, pBuffer); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle, - int m, - int n, - int k, - const double* alpha, - const cusparseMatDescr_t descrA, - int nnzA, - const double* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - const cusparseMatDescr_t descrB, - int nnzB, - const double* csrValB, - const int* csrRowPtrB, - const int* csrColIndB, - const double* beta, - const cusparseMatDescr_t descrD, - int nnzD, - const double* csrValD, - const int* csrRowPtrD, - const int* csrColIndD, - const cusparseMatDescr_t descrC, - double* csrValC, - const int* csrRowPtrC, - int* csrColIndC, - const csrgemm2Info_t info, - void* pBuffer, - cudaStream_t stream) -{ +inline cusparseStatus_t cusparsecsrgemm2( + cusparseHandle_t handle, int m, int n, int k, const double* alpha, + const cusparseMatDescr_t descrA, int nnzA, const double* csrValA, + const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB, + int nnzB, const double* csrValB, const int* csrRowPtrB, const int* csrColIndB, + const double* beta, const cusparseMatDescr_t descrD, int nnzD, + const double* csrValD, const int* csrRowPtrD, const int* csrColIndD, + const cusparseMatDescr_t descrC, double* csrValC, const int* csrRowPtrC, + int* csrColIndC, const csrgemm2Info_t info, void* pBuffer, + cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDcsrgemm2(handle, - m, - n, - k, - alpha, - descrA, - nnzA, - csrValA, - csrRowPtrA, - csrColIndA, - descrB, - nnzB, - csrValB, - csrRowPtrB, - csrColIndB, - beta, - descrD, - nnzD, - csrValD, - csrRowPtrD, - csrColIndD, - descrC, - csrValC, - csrRowPtrC, - csrColIndC, - info, - pBuffer); + return cusparseDcsrgemm2(handle, m, n, k, alpha, descrA, nnzA, csrValA, + csrRowPtrA, csrColIndA, descrB, nnzB, csrValB, + csrRowPtrB, csrColIndB, beta, descrD, nnzD, csrValD, + csrRowPtrD, csrColIndD, descrC, csrValC, csrRowPtrC, + csrColIndC, info, pBuffer); #pragma GCC diagnostic pop } @@ -1553,46 +834,33 @@ inline cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle, */ template -cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, - int m, - int n, +cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, - const T* csrValA, - const int* csrRowPtrA, - const int* csrColIndA, - T* A, - int lda, + const T* csrValA, const int* csrRowPtrA, + const int* csrColIndA, T* A, int lda, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, - int m, - int n, +inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, const float* csrValA, const int* csrRowPtrA, - const int* csrColIndA, - float* A, - int lda, - cudaStream_t stream) -{ + const int* csrColIndA, float* A, + int lda, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda); + return cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, + csrColIndA, A, lda); } template <> -inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, - int m, - int n, +inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, const double* csrValA, const int* csrRowPtrA, - const int* csrColIndA, - double* A, - int lda, - cudaStream_t stream) -{ + const int* csrColIndA, double* A, + int lda, cudaStream_t stream) { CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda); + return cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, + csrColIndA, A, lda); } /** @} */ diff --git a/cpp/include/raft/sparse/distance/bin_distance.cuh b/cpp/include/raft/sparse/distance/bin_distance.cuh index aef19122da..f3109556b7 100644 --- a/cpp/include/raft/sparse/distance/bin_distance.cuh +++ b/cpp/include/raft/sparse/distance/bin_distance.cuh @@ -37,11 +37,9 @@ namespace distance { // @TODO: Move this into sparse prims (coo_norm) template -__global__ void compute_binary_row_norm_kernel(value_t* out, - const value_idx* __restrict__ coo_rows, - const value_t* __restrict__ data, - value_idx nnz) -{ +__global__ void compute_binary_row_norm_kernel( + value_t *out, const value_idx *__restrict__ coo_rows, + const value_t *__restrict__ data, value_idx nnz) { value_idx i = blockDim.x * blockIdx.x + threadIdx.x; if (i < nnz) { // We do conditional here only because it's @@ -53,64 +51,55 @@ __global__ void compute_binary_row_norm_kernel(value_t* out, } template -__global__ void compute_binary_warp_kernel(value_t* __restrict__ C, - const value_t* __restrict__ Q_norms, - const value_t* __restrict__ R_norms, - value_idx n_rows, - value_idx n_cols, - expansion_f expansion_func) -{ +__global__ void compute_binary_warp_kernel(value_t *__restrict__ C, + const value_t *__restrict__ Q_norms, + const value_t *__restrict__ R_norms, + value_idx n_rows, value_idx n_cols, + expansion_f expansion_func) { value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; - value_idx i = tid / n_cols; - value_idx j = tid % n_cols; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; if (i >= n_rows || j >= n_cols) return; - value_t q_norm = Q_norms[i]; - value_t r_norm = R_norms[j]; - value_t dot = C[(size_t)i * n_cols + j]; + value_t q_norm = Q_norms[i]; + value_t r_norm = R_norms[j]; + value_t dot = C[(size_t)i * n_cols + j]; C[(size_t)i * n_cols + j] = expansion_func(dot, q_norm, r_norm); } -template -void compute_binary(value_t* C, - const value_t* Q_norms, - const value_t* R_norms, - value_idx n_rows, - value_idx n_cols, - expansion_f expansion_func, - cudaStream_t stream) -{ +template +void compute_binary(value_t *C, const value_t *Q_norms, const value_t *R_norms, + value_idx n_rows, value_idx n_cols, + expansion_f expansion_func, cudaStream_t stream) { int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); compute_binary_warp_kernel<<>>( C, Q_norms, R_norms, n_rows, n_cols, expansion_func); } -template -void compute_bin_distance(value_t* out, - const value_idx* Q_coo_rows, - const value_t* Q_data, - value_idx Q_nnz, - const value_idx* R_coo_rows, - const value_t* R_data, - value_idx R_nnz, - value_idx m, - value_idx n, +template +void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows, + const value_t *Q_data, value_idx Q_nnz, + const value_idx *R_coo_rows, const value_t *R_data, + value_idx R_nnz, value_idx m, value_idx n, std::shared_ptr alloc, - cudaStream_t stream, - expansion_f expansion_func) -{ + cudaStream_t stream, expansion_f expansion_func) { rmm::device_uvector Q_norms(m, stream); rmm::device_uvector R_norms(n, stream); - CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); - CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); + CUDA_CHECK( + cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); + CUDA_CHECK( + cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); compute_binary_row_norm_kernel<<>>( Q_norms.data(), Q_coo_rows, Q_data, Q_nnz); compute_binary_row_norm_kernel<<>>( R_norms.data(), R_coo_rows, R_data, R_nnz); - compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, stream); + compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, + stream); } /** @@ -120,52 +109,44 @@ void compute_bin_distance(value_t* out, template class jaccard_expanded_distances_t : public distances_t { public: - explicit jaccard_expanded_distances_t(const distances_config_t& config) - : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) - { - } + explicit jaccard_expanded_distances_t( + const distances_config_t &config) + : config_(&config), + workspace(0, config.handle.get_stream()), + ip_dists(config) {} - void compute(value_t* out_dists) - { + void compute(value_t *out_dists) { ip_dists.compute(out_dists); - value_idx* b_indices = ip_dists.b_rows_coo(); - value_t* b_data = ip_dists.b_data_coo(); + value_idx *b_indices = ip_dists.b_rows_coo(); + value_t *b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, - config_->a_nrows, - search_coo_rows.data(), - config_->a_nnz, + rmm::device_uvector search_coo_rows( + config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, + search_coo_rows.data(), config_->a_nnz, config_->handle.get_stream()); - compute_bin_distance(out_dists, - search_coo_rows.data(), - config_->a_data, - config_->a_nnz, - b_indices, - b_data, - config_->b_nnz, - config_->a_nrows, - config_->b_nrows, - config_->handle.get_device_allocator(), - config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - value_t q_r_union = q_norm + r_norm; - value_t denom = q_r_union - dot; - - value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom); - - // flip the similarity when both rows are 0 - bool both_empty = q_r_union == 0; - return 1 - ((!both_empty * jacc) + both_empty); - }); + compute_bin_distance( + out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, + b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, + config_->handle.get_device_allocator(), config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + value_t denom = q_r_union - dot; + + value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom); + + // flip the similarity when both rows are 0 + bool both_empty = q_r_union == 0; + return 1 - ((!both_empty * jacc) + both_empty); + }); } ~jaccard_expanded_distances_t() = default; private: - const distances_config_t* config_; + const distances_config_t *config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; @@ -177,48 +158,40 @@ class jaccard_expanded_distances_t : public distances_t { template class dice_expanded_distances_t : public distances_t { public: - explicit dice_expanded_distances_t(const distances_config_t& config) - : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) - { - } + explicit dice_expanded_distances_t( + const distances_config_t &config) + : config_(&config), + workspace(0, config.handle.get_stream()), + ip_dists(config) {} - void compute(value_t* out_dists) - { + void compute(value_t *out_dists) { ip_dists.compute(out_dists); - value_idx* b_indices = ip_dists.b_rows_coo(); - value_t* b_data = ip_dists.b_data_coo(); + value_idx *b_indices = ip_dists.b_rows_coo(); + value_t *b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, - config_->a_nrows, - search_coo_rows.data(), - config_->a_nnz, + rmm::device_uvector search_coo_rows( + config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, + search_coo_rows.data(), config_->a_nnz, config_->handle.get_stream()); - compute_bin_distance(out_dists, - search_coo_rows.data(), - config_->a_data, - config_->a_nnz, - b_indices, - b_data, - config_->b_nnz, - config_->a_nrows, - config_->b_nrows, - config_->handle.get_device_allocator(), - config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - value_t q_r_union = q_norm + r_norm; - value_t dice = (2 * dot) / q_r_union; - bool both_empty = q_r_union == 0; - return 1 - ((!both_empty * dice) + both_empty); - }); + compute_bin_distance( + out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, + b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, + config_->handle.get_device_allocator(), config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + value_t dice = (2 * dot) / q_r_union; + bool both_empty = q_r_union == 0; + return 1 - ((!both_empty * dice) + both_empty); + }); } ~dice_expanded_distances_t() = default; private: - const distances_config_t* config_; + const distances_config_t *config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; diff --git a/cpp/include/raft/sparse/distance/common.h b/cpp/include/raft/sparse/distance/common.h index 29c823bcdb..1c55412eec 100644 --- a/cpp/include/raft/sparse/distance/common.h +++ b/cpp/include/raft/sparse/distance/common.h @@ -24,31 +24,31 @@ namespace distance { template struct distances_config_t { - distances_config_t(const raft::handle_t& handle_) : handle(handle_) {} + distances_config_t(const raft::handle_t &handle_) : handle(handle_) {} // left side value_idx a_nrows; value_idx a_ncols; value_idx a_nnz; - value_idx* a_indptr; - value_idx* a_indices; - value_t* a_data; + value_idx *a_indptr; + value_idx *a_indices; + value_t *a_data; // right side value_idx b_nrows; value_idx b_ncols; value_idx b_nnz; - value_idx* b_indptr; - value_idx* b_indices; - value_t* b_data; + value_idx *b_indptr; + value_idx *b_indices; + value_t *b_data; - const raft::handle_t& handle; + const raft::handle_t &handle; }; template class distances_t { public: - virtual void compute(value_t* out) {} + virtual void compute(value_t *out) {} virtual ~distances_t() = default; }; diff --git a/cpp/include/raft/sparse/distance/coo_spmv.cuh b/cpp/include/raft/sparse/distance/coo_spmv.cuh index cdf1be0c68..3a78f9ada0 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv.cuh @@ -41,29 +41,19 @@ namespace raft { namespace sparse { namespace distance { -template inline void balanced_coo_pairwise_generalized_spmv( - value_t* out_dists, - const distances_config_t& config_, - value_idx* coo_rows_b, - product_f product_func, - accum_f accum_func, - write_f write_func, - strategy_t strategy, - int chunk_size = 500000) -{ - CUDA_CHECK(cudaMemsetAsync(out_dists, - 0, - sizeof(value_t) * config_.a_nrows * config_.b_nrows, - config_.handle.get_stream())); - - strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); + value_t *out_dists, const distances_config_t &config_, + value_idx *coo_rows_b, product_f product_func, accum_f accum_func, + write_f write_func, strategy_t strategy, int chunk_size = 500000) { + CUDA_CHECK(cudaMemsetAsync( + out_dists, 0, sizeof(value_t) * config_.a_nrows * config_.b_nrows, + config_.handle.get_stream())); + + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, + chunk_size); }; /** @@ -99,55 +89,39 @@ inline void balanced_coo_pairwise_generalized_spmv( * this value was found through profiling and represents a reasonable * setting for both large and small densities */ -template +template inline void balanced_coo_pairwise_generalized_spmv( - value_t* out_dists, - const distances_config_t& config_, - value_idx* coo_rows_b, - product_f product_func, - accum_f accum_func, - write_f write_func, - int chunk_size = 500000) -{ - CUDA_CHECK(cudaMemsetAsync(out_dists, - 0, - sizeof(value_t) * config_.a_nrows * config_.b_nrows, - config_.handle.get_stream())); + value_t *out_dists, const distances_config_t &config_, + value_idx *coo_rows_b, product_f product_func, accum_f accum_func, + write_f write_func, int chunk_size = 500000) { + CUDA_CHECK(cudaMemsetAsync( + out_dists, 0, sizeof(value_t) * config_.a_nrows * config_.b_nrows, + config_.handle.get_stream())); int max_cols = max_cols_per_block(); if (max_cols > config_.a_ncols) { - dense_smem_strategy strategy(config_); - strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); + dense_smem_strategy strategy( + config_); + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, + write_func, chunk_size); } else { hash_strategy strategy(config_); - strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, + write_func, chunk_size); } }; -template inline void balanced_coo_pairwise_generalized_spmv_rev( - value_t* out_dists, - const distances_config_t& config_, - value_idx* coo_rows_a, - product_f product_func, - accum_f accum_func, - write_f write_func, - strategy_t strategy, - int chunk_size = 500000) -{ - strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); + value_t *out_dists, const distances_config_t &config_, + value_idx *coo_rows_a, product_f product_func, accum_f accum_func, + write_f write_func, strategy_t strategy, int chunk_size = 500000) { + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, + write_func, chunk_size); }; /** @@ -186,30 +160,24 @@ inline void balanced_coo_pairwise_generalized_spmv_rev( * this value was found through profiling and represents a reasonable * setting for both large and small densities */ -template +template inline void balanced_coo_pairwise_generalized_spmv_rev( - value_t* out_dists, - const distances_config_t& config_, - value_idx* coo_rows_a, - product_f product_func, - accum_f accum_func, - write_f write_func, - int chunk_size = 500000) -{ + value_t *out_dists, const distances_config_t &config_, + value_idx *coo_rows_a, product_f product_func, accum_f accum_func, + write_f write_func, int chunk_size = 500000) { // try dense first int max_cols = max_cols_per_block(); if (max_cols > config_.b_ncols) { - dense_smem_strategy strategy(config_); - strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); + dense_smem_strategy strategy( + config_); + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, + write_func, chunk_size); } else { hash_strategy strategy(config_); - strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, + write_func, chunk_size); } }; diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh index 7a83e73183..5ace978a23 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh @@ -32,114 +32,58 @@ namespace distance { template class coo_spmv_strategy { public: - coo_spmv_strategy(const distances_config_t& config_) : config(config_) - { + coo_spmv_strategy(const distances_config_t &config_) + : config(config_) { smem = raft::getSharedMemPerBlock(); } - template - void _dispatch_base(strategy_t& strategy, - int smem_dim, - indptr_it& a_indptr, - value_t* out_dists, - value_idx* coo_rows_b, - product_f product_func, - accum_f accum_func, - write_f write_func, - int chunk_size, - int n_blocks, - int n_blocks_per_row) - { - CUDA_CHECK(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, - cudaFuncCachePreferShared)); + template + void _dispatch_base(strategy_t &strategy, int smem_dim, indptr_it &a_indptr, + value_t *out_dists, value_idx *coo_rows_b, + product_f product_func, accum_f accum_func, + write_f write_func, int chunk_size, int n_blocks, + int n_blocks_per_row) { + CUDA_CHECK(cudaFuncSetCacheConfig( + balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); - balanced_coo_generalized_spmv_kernel - <<>>(strategy, - a_indptr, - config.a_indices, - config.a_data, - config.a_nnz, - coo_rows_b, - config.b_indices, - config.b_data, - config.a_nrows, - config.b_nrows, - smem_dim, - config.b_nnz, - out_dists, - n_blocks_per_row, - chunk_size, - config.b_ncols, - product_func, - accum_func, - write_func); + balanced_coo_generalized_spmv_kernel + <<>>( + strategy, a_indptr, config.a_indices, config.a_data, config.a_nnz, + coo_rows_b, config.b_indices, config.b_data, config.a_nrows, + config.b_nrows, smem_dim, config.b_nnz, out_dists, n_blocks_per_row, + chunk_size, config.b_ncols, product_func, accum_func, write_func); } - template - void _dispatch_base_rev(strategy_t& strategy, - int smem_dim, - indptr_it& b_indptr, - value_t* out_dists, - value_idx* coo_rows_a, - product_f product_func, - accum_f accum_func, - write_f write_func, - int chunk_size, - int n_blocks, - int n_blocks_per_row) - { - CUDA_CHECK(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, - cudaFuncCachePreferShared)); + template + void _dispatch_base_rev(strategy_t &strategy, int smem_dim, + indptr_it &b_indptr, value_t *out_dists, + value_idx *coo_rows_a, product_f product_func, + accum_f accum_func, write_f write_func, + int chunk_size, int n_blocks, int n_blocks_per_row) { + CUDA_CHECK(cudaFuncSetCacheConfig( + balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); - balanced_coo_generalized_spmv_kernel - <<>>(strategy, - b_indptr, - config.b_indices, - config.b_data, - config.b_nnz, - coo_rows_a, - config.a_indices, - config.a_data, - config.b_nrows, - config.a_nrows, - smem_dim, - config.a_nnz, - out_dists, - n_blocks_per_row, - chunk_size, - config.a_ncols, - product_func, - accum_func, - write_func); + balanced_coo_generalized_spmv_kernel + <<>>( + strategy, b_indptr, config.b_indices, config.b_data, config.b_nnz, + coo_rows_a, config.a_indices, config.a_data, config.b_nrows, + config.a_nrows, smem_dim, config.a_nnz, out_dists, n_blocks_per_row, + chunk_size, config.a_ncols, product_func, accum_func, write_func); } protected: int smem; - const distances_config_t& config; + const distances_config_t &config; }; } // namespace distance diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh index 6586067b56..44c3833f96 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh @@ -29,15 +29,11 @@ namespace distance { template class mask_row_it { public: - mask_row_it(const value_idx* full_indptr_, - const value_idx& n_rows_, - value_idx* mask_row_idx_ = NULL) - : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_) - { - } + mask_row_it(const value_idx *full_indptr_, const value_idx &n_rows_, + value_idx *mask_row_idx_ = NULL) + : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_) {} - __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b) - { + __device__ inline value_idx get_row_idx(const int &n_blocks_nnz_b) { if (mask_row_idx != NULL) { return mask_row_idx[blockIdx.x / n_blocks_nnz_b]; } else { @@ -45,49 +41,37 @@ class mask_row_it { } } - __device__ inline void get_row_offsets(const value_idx& row_idx, - value_idx& start_offset, - value_idx& stop_offset, - const value_idx& n_blocks_nnz_b, - bool& first_a_chunk, - bool& last_a_chunk) - { + __device__ inline void get_row_offsets( + const value_idx &row_idx, value_idx &start_offset, value_idx &stop_offset, + const value_idx &n_blocks_nnz_b, bool &first_a_chunk, bool &last_a_chunk) { start_offset = full_indptr[row_idx]; - stop_offset = full_indptr[row_idx + 1] - 1; + stop_offset = full_indptr[row_idx + 1] - 1; } - __device__ constexpr inline void get_indices_boundary(const value_idx* indices, - value_idx& indices_len, - value_idx& start_offset, - value_idx& stop_offset, - value_idx& start_index, - value_idx& stop_index, - bool& first_a_chunk, - bool& last_a_chunk) - { + __device__ constexpr inline void get_indices_boundary( + const value_idx *indices, value_idx &indices_len, value_idx &start_offset, + value_idx &stop_offset, value_idx &start_index, value_idx &stop_index, + bool &first_a_chunk, bool &last_a_chunk) { // do nothing; } - __device__ constexpr inline bool check_indices_bounds(value_idx& start_index_a, - value_idx& stop_index_a, - value_idx& index_b) - { + __device__ constexpr inline bool check_indices_bounds( + value_idx &start_index_a, value_idx &stop_index_a, value_idx &index_b) { return true; } const value_idx *full_indptr, &n_rows; - value_idx* mask_row_idx; + value_idx *mask_row_idx; }; template -__global__ void fill_chunk_indices_kernel(value_idx* n_chunks_per_row, - value_idx* chunk_indices, - value_idx n_rows) -{ +__global__ void fill_chunk_indices_kernel(value_idx *n_chunks_per_row, + value_idx *chunk_indices, + value_idx n_rows) { auto tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < n_rows) { auto start = n_chunks_per_row[tid]; - auto end = n_chunks_per_row[tid + 1]; + auto end = n_chunks_per_row[tid + 1]; #pragma unroll for (int i = start; i < end; i++) { @@ -99,89 +83,73 @@ __global__ void fill_chunk_indices_kernel(value_idx* n_chunks_per_row, template class chunked_mask_row_it : public mask_row_it { public: - chunked_mask_row_it(const value_idx* full_indptr_, - const value_idx& n_rows_, - value_idx* mask_row_idx_, - int row_chunk_size_, - const value_idx* n_chunks_per_row_, - const value_idx* chunk_indices_, + chunked_mask_row_it(const value_idx *full_indptr_, const value_idx &n_rows_, + value_idx *mask_row_idx_, int row_chunk_size_, + const value_idx *n_chunks_per_row_, + const value_idx *chunk_indices_, const cudaStream_t stream_) : mask_row_it(full_indptr_, n_rows_, mask_row_idx_), row_chunk_size(row_chunk_size_), n_chunks_per_row(n_chunks_per_row_), chunk_indices(chunk_indices_), - stream(stream_) - { - } + stream(stream_) {} - static void init(const value_idx* indptr, - const value_idx* mask_row_idx, - const value_idx& n_rows, - const int row_chunk_size, - rmm::device_uvector& n_chunks_per_row, - rmm::device_uvector& chunk_indices, - cudaStream_t stream) - { + static void init(const value_idx *indptr, const value_idx *mask_row_idx, + const value_idx &n_rows, const int row_chunk_size, + rmm::device_uvector &n_chunks_per_row, + rmm::device_uvector &chunk_indices, + cudaStream_t stream) { auto policy = rmm::exec_policy(stream); constexpr value_idx first_element = 0; n_chunks_per_row.set_element_async(0, first_element, stream); n_chunks_per_row_functor chunk_functor(indptr, row_chunk_size); - thrust::transform( - policy, mask_row_idx, mask_row_idx + n_rows, n_chunks_per_row.begin() + 1, chunk_functor); + thrust::transform(policy, mask_row_idx, mask_row_idx + n_rows, + n_chunks_per_row.begin() + 1, chunk_functor); - thrust::inclusive_scan( - policy, n_chunks_per_row.begin() + 1, n_chunks_per_row.end(), n_chunks_per_row.begin() + 1); + thrust::inclusive_scan(policy, n_chunks_per_row.begin() + 1, + n_chunks_per_row.end(), + n_chunks_per_row.begin() + 1); - raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1, stream); + raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1, + stream); fill_chunk_indices(n_rows, n_chunks_per_row, chunk_indices, stream); } - __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b) - { + __device__ inline value_idx get_row_idx(const int &n_blocks_nnz_b) { return this->mask_row_idx[chunk_indices[blockIdx.x / n_blocks_nnz_b]]; } - __device__ inline void get_row_offsets(const value_idx& row_idx, - value_idx& start_offset, - value_idx& stop_offset, - const int& n_blocks_nnz_b, - bool& first_a_chunk, - bool& last_a_chunk) - { - auto chunk_index = blockIdx.x / n_blocks_nnz_b; - auto chunk_val = chunk_indices[chunk_index]; - auto prev_n_chunks = n_chunks_per_row[chunk_val]; + __device__ inline void get_row_offsets( + const value_idx &row_idx, value_idx &start_offset, value_idx &stop_offset, + const int &n_blocks_nnz_b, bool &first_a_chunk, bool &last_a_chunk) { + auto chunk_index = blockIdx.x / n_blocks_nnz_b; + auto chunk_val = chunk_indices[chunk_index]; + auto prev_n_chunks = n_chunks_per_row[chunk_val]; auto relative_chunk = chunk_index - prev_n_chunks; - first_a_chunk = relative_chunk == 0; + first_a_chunk = relative_chunk == 0; start_offset = this->full_indptr[row_idx] + relative_chunk * row_chunk_size; - stop_offset = start_offset + row_chunk_size; + stop_offset = start_offset + row_chunk_size; auto final_stop_offset = this->full_indptr[row_idx + 1]; last_a_chunk = stop_offset >= final_stop_offset; - stop_offset = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1; + stop_offset = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1; } - __device__ inline void get_indices_boundary(const value_idx* indices, - value_idx& row_idx, - value_idx& start_offset, - value_idx& stop_offset, - value_idx& start_index, - value_idx& stop_index, - bool& first_a_chunk, - bool& last_a_chunk) - { + __device__ inline void get_indices_boundary( + const value_idx *indices, value_idx &row_idx, value_idx &start_offset, + value_idx &stop_offset, value_idx &start_index, value_idx &stop_index, + bool &first_a_chunk, bool &last_a_chunk) { start_index = first_a_chunk ? start_index : indices[start_offset - 1] + 1; - stop_index = last_a_chunk ? stop_index : indices[stop_offset]; + stop_index = last_a_chunk ? stop_index : indices[stop_offset]; } - __device__ inline bool check_indices_bounds(value_idx& start_index_a, - value_idx& stop_index_a, - value_idx& index_b) - { + __device__ inline bool check_indices_bounds(value_idx &start_index_a, + value_idx &stop_index_a, + value_idx &index_b) { return (index_b >= start_index_a && index_b <= stop_index_a); } @@ -192,34 +160,30 @@ class chunked_mask_row_it : public mask_row_it { struct n_chunks_per_row_functor { public: - n_chunks_per_row_functor(const value_idx* indptr_, value_idx row_chunk_size_) - : indptr(indptr_), row_chunk_size(row_chunk_size_) - { - } + n_chunks_per_row_functor(const value_idx *indptr_, + value_idx row_chunk_size_) + : indptr(indptr_), row_chunk_size(row_chunk_size_) {} - __host__ __device__ value_idx operator()(const value_idx& i) - { + __host__ __device__ value_idx operator()(const value_idx &i) { auto degree = indptr[i + 1] - indptr[i]; return raft::ceildiv(degree, (value_idx)row_chunk_size); } - const value_idx* indptr; + const value_idx *indptr; value_idx row_chunk_size; }; private: - static void fill_chunk_indices(const value_idx& n_rows, - rmm::device_uvector& n_chunks_per_row, - rmm::device_uvector& chunk_indices, - cudaStream_t stream) - { + static void fill_chunk_indices( + const value_idx &n_rows, rmm::device_uvector &n_chunks_per_row, + rmm::device_uvector &chunk_indices, cudaStream_t stream) { auto n_threads = std::min(n_rows, 256); - auto n_blocks = raft::ceildiv(n_rows, (value_idx)n_threads); + auto n_blocks = raft::ceildiv(n_rows, (value_idx)n_threads); chunk_indices.resize(total_row_blocks, stream); - fill_chunk_indices_kernel - <<>>(n_chunks_per_row.data(), chunk_indices.data(), n_rows); + fill_chunk_indices_kernel<<>>( + n_chunks_per_row.data(), chunk_indices.data(), n_rows); } }; diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh index aac98d6b02..c463654a3b 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh @@ -25,91 +25,71 @@ namespace distance { template class dense_smem_strategy : public coo_spmv_strategy { public: - using smem_type = value_t*; + using smem_type = value_t *; using insert_type = smem_type; - using find_type = smem_type; + using find_type = smem_type; - dense_smem_strategy(const distances_config_t& config_) - : coo_spmv_strategy(config_) - { - } + dense_smem_strategy(const distances_config_t &config_) + : coo_spmv_strategy(config_) {} - inline static int smem_per_block(int n_cols) - { - return (n_cols * sizeof(value_t)) + ((1024 / raft::warp_size()) * sizeof(value_t)); + inline static int smem_per_block(int n_cols) { + return (n_cols * sizeof(value_t)) + + ((1024 / raft::warp_size()) * sizeof(value_t)); } template - void dispatch(value_t* out_dists, - value_idx* coo_rows_b, - product_f product_func, - accum_f accum_func, - write_f write_func, - int chunk_size) - { - auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * 1024); - auto n_blocks = this->config.a_nrows * n_blocks_per_row; - - mask_row_it a_indptr(this->config.a_indptr, this->config.a_nrows); - - this->_dispatch_base(*this, - this->config.b_ncols, - a_indptr, - out_dists, - coo_rows_b, - product_func, - accum_func, - write_func, - chunk_size, - n_blocks, - n_blocks_per_row); + void dispatch(value_t *out_dists, value_idx *coo_rows_b, + product_f product_func, accum_f accum_func, write_f write_func, + int chunk_size) { + auto n_blocks_per_row = + raft::ceildiv(this->config.b_nnz, chunk_size * 1024); + auto n_blocks = this->config.a_nrows * n_blocks_per_row; + + mask_row_it a_indptr(this->config.a_indptr, + this->config.a_nrows); + + this->_dispatch_base(*this, this->config.b_ncols, a_indptr, out_dists, + coo_rows_b, product_func, accum_func, write_func, + chunk_size, n_blocks, n_blocks_per_row); } template - void dispatch_rev(value_t* out_dists, - value_idx* coo_rows_a, - product_f product_func, - accum_f accum_func, - write_f write_func, - int chunk_size) - { - auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * 1024); - auto n_blocks = this->config.b_nrows * n_blocks_per_row; - - mask_row_it b_indptr(this->config.b_indptr, this->config.b_nrows); - - this->_dispatch_base_rev(*this, - this->config.a_ncols, - b_indptr, - out_dists, - coo_rows_a, - product_func, - accum_func, - write_func, - chunk_size, - n_blocks, - n_blocks_per_row); + void dispatch_rev(value_t *out_dists, value_idx *coo_rows_a, + product_f product_func, accum_f accum_func, + write_f write_func, int chunk_size) { + auto n_blocks_per_row = + raft::ceildiv(this->config.a_nnz, chunk_size * 1024); + auto n_blocks = this->config.b_nrows * n_blocks_per_row; + + mask_row_it b_indptr(this->config.b_indptr, + this->config.b_nrows); + + this->_dispatch_base_rev(*this, this->config.a_ncols, b_indptr, out_dists, + coo_rows_a, product_func, accum_func, write_func, + chunk_size, n_blocks, n_blocks_per_row); } - __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size) - { + __device__ inline insert_type init_insert(smem_type cache, + const value_idx &cache_size) { for (int k = threadIdx.x; k < cache_size; k += blockDim.x) { cache[k] = 0.0; } return cache; } - __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value) - { + __device__ inline void insert(insert_type cache, const value_idx &key, + const value_t &value) { cache[key] = value; } - __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size) - { + __device__ inline find_type init_find(smem_type cache, + const value_idx &cache_size) { return cache; } - __device__ inline value_t find(find_type cache, const value_idx& key) { return cache[key]; } + __device__ inline value_t find(find_type cache, const value_idx &key) { + return cache[key]; + } }; } // namespace distance diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh index 3f8f4b21ad..1295d24103 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh @@ -1,18 +1,18 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once @@ -38,238 +38,177 @@ template class hash_strategy : public coo_spmv_strategy { public: using insert_type = - typename cuco::static_map::device_mutable_view; - using smem_type = typename insert_type::slot_type*; + typename cuco::static_map::device_mutable_view; + using smem_type = typename insert_type::slot_type *; using find_type = - typename cuco::static_map::device_view; + typename cuco::static_map::device_view; - hash_strategy(const distances_config_t& config_, - float capacity_threshold_ = 0.5, - int map_size_ = get_map_size()) + hash_strategy(const distances_config_t &config_, + float capacity_threshold_ = 0.5, int map_size_ = get_map_size()) : coo_spmv_strategy(config_), capacity_threshold(capacity_threshold_), - map_size(map_size_) - { - } + map_size(map_size_) {} - void chunking_needed(const value_idx* indptr, - const value_idx n_rows, - rmm::device_uvector& mask_indptr, - std::tuple& n_rows_divided, - cudaStream_t stream) - { + void chunking_needed(const value_idx *indptr, const value_idx n_rows, + rmm::device_uvector &mask_indptr, + std::tuple &n_rows_divided, + cudaStream_t stream) { auto policy = rmm::exec_policy(stream); - auto less = thrust::copy_if(policy, - thrust::make_counting_iterator(value_idx(0)), - thrust::make_counting_iterator(n_rows), - mask_indptr.data(), - fits_in_hash_table(indptr, 0, capacity_threshold * map_size)); + auto less = thrust::copy_if( + policy, thrust::make_counting_iterator(value_idx(0)), + thrust::make_counting_iterator(n_rows), mask_indptr.data(), + fits_in_hash_table(indptr, 0, capacity_threshold * map_size)); std::get<0>(n_rows_divided) = less - mask_indptr.data(); auto more = thrust::copy_if( - policy, - thrust::make_counting_iterator(value_idx(0)), - thrust::make_counting_iterator(n_rows), - less, - fits_in_hash_table( - indptr, capacity_threshold * map_size, std::numeric_limits::max())); + policy, thrust::make_counting_iterator(value_idx(0)), + thrust::make_counting_iterator(n_rows), less, + fits_in_hash_table(indptr, capacity_threshold * map_size, + std::numeric_limits::max())); std::get<1>(n_rows_divided) = more - less; } template - void dispatch(value_t* out_dists, - value_idx* coo_rows_b, - product_f product_func, - accum_f accum_func, - write_f write_func, - int chunk_size) - { + void dispatch(value_t *out_dists, value_idx *coo_rows_b, + product_f product_func, accum_f accum_func, write_f write_func, + int chunk_size) { auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * tpb); - rmm::device_uvector mask_indptr(this->config.a_nrows, - this->config.handle.get_stream()); + rmm::device_uvector mask_indptr( + this->config.a_nrows, this->config.handle.get_stream()); std::tuple n_rows_divided; - chunking_needed(this->config.a_indptr, - this->config.a_nrows, - mask_indptr, - n_rows_divided, - this->config.handle.get_stream()); + chunking_needed(this->config.a_indptr, this->config.a_nrows, mask_indptr, + n_rows_divided, this->config.handle.get_stream()); auto less_rows = std::get<0>(n_rows_divided); if (less_rows > 0) { - mask_row_it less(this->config.a_indptr, less_rows, mask_indptr.data()); + mask_row_it less(this->config.a_indptr, less_rows, + mask_indptr.data()); auto n_less_blocks = less_rows * n_blocks_per_row; - this->_dispatch_base(*this, - map_size, - less, - out_dists, - coo_rows_b, - product_func, - accum_func, - write_func, - chunk_size, - n_less_blocks, - n_blocks_per_row); + this->_dispatch_base(*this, map_size, less, out_dists, coo_rows_b, + product_func, accum_func, write_func, chunk_size, + n_less_blocks, n_blocks_per_row); } auto more_rows = std::get<1>(n_rows_divided); if (more_rows > 0) { - rmm::device_uvector n_chunks_per_row(more_rows + 1, - this->config.handle.get_stream()); - rmm::device_uvector chunk_indices(0, this->config.handle.get_stream()); - chunked_mask_row_it::init(this->config.a_indptr, - mask_indptr.data() + less_rows, - more_rows, - capacity_threshold * map_size, - n_chunks_per_row, - chunk_indices, - this->config.handle.get_stream()); - - chunked_mask_row_it more(this->config.a_indptr, - more_rows, - mask_indptr.data() + less_rows, - capacity_threshold * map_size, - n_chunks_per_row.data(), - chunk_indices.data(), - this->config.handle.get_stream()); + rmm::device_uvector n_chunks_per_row( + more_rows + 1, this->config.handle.get_stream()); + rmm::device_uvector chunk_indices( + 0, this->config.handle.get_stream()); + chunked_mask_row_it::init( + this->config.a_indptr, mask_indptr.data() + less_rows, more_rows, + capacity_threshold * map_size, n_chunks_per_row, chunk_indices, + this->config.handle.get_stream()); + + chunked_mask_row_it more( + this->config.a_indptr, more_rows, mask_indptr.data() + less_rows, + capacity_threshold * map_size, n_chunks_per_row.data(), + chunk_indices.data(), this->config.handle.get_stream()); auto n_more_blocks = more.total_row_blocks * n_blocks_per_row; - this->_dispatch_base(*this, - map_size, - more, - out_dists, - coo_rows_b, - product_func, - accum_func, - write_func, - chunk_size, - n_more_blocks, - n_blocks_per_row); + this->_dispatch_base(*this, map_size, more, out_dists, coo_rows_b, + product_func, accum_func, write_func, chunk_size, + n_more_blocks, n_blocks_per_row); } } template - void dispatch_rev(value_t* out_dists, - value_idx* coo_rows_a, - product_f product_func, - accum_f accum_func, - write_f write_func, - int chunk_size) - { + void dispatch_rev(value_t *out_dists, value_idx *coo_rows_a, + product_f product_func, accum_f accum_func, + write_f write_func, int chunk_size) { auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * tpb); - rmm::device_uvector mask_indptr(this->config.b_nrows, - this->config.handle.get_stream()); + rmm::device_uvector mask_indptr( + this->config.b_nrows, this->config.handle.get_stream()); std::tuple n_rows_divided; - chunking_needed(this->config.b_indptr, - this->config.b_nrows, - mask_indptr, - n_rows_divided, - this->config.handle.get_stream()); + chunking_needed(this->config.b_indptr, this->config.b_nrows, mask_indptr, + n_rows_divided, this->config.handle.get_stream()); auto less_rows = std::get<0>(n_rows_divided); if (less_rows > 0) { - mask_row_it less(this->config.b_indptr, less_rows, mask_indptr.data()); + mask_row_it less(this->config.b_indptr, less_rows, + mask_indptr.data()); auto n_less_blocks = less_rows * n_blocks_per_row; - this->_dispatch_base_rev(*this, - map_size, - less, - out_dists, - coo_rows_a, - product_func, - accum_func, - write_func, - chunk_size, - n_less_blocks, - n_blocks_per_row); + this->_dispatch_base_rev(*this, map_size, less, out_dists, coo_rows_a, + product_func, accum_func, write_func, chunk_size, + n_less_blocks, n_blocks_per_row); } auto more_rows = std::get<1>(n_rows_divided); if (more_rows > 0) { - rmm::device_uvector n_chunks_per_row(more_rows + 1, - this->config.handle.get_stream()); - rmm::device_uvector chunk_indices(0, this->config.handle.get_stream()); - chunked_mask_row_it::init(this->config.b_indptr, - mask_indptr.data() + less_rows, - more_rows, - capacity_threshold * map_size, - n_chunks_per_row, - chunk_indices, - this->config.handle.get_stream()); - - chunked_mask_row_it more(this->config.b_indptr, - more_rows, - mask_indptr.data() + less_rows, - capacity_threshold * map_size, - n_chunks_per_row.data(), - chunk_indices.data(), - this->config.handle.get_stream()); + rmm::device_uvector n_chunks_per_row( + more_rows + 1, this->config.handle.get_stream()); + rmm::device_uvector chunk_indices( + 0, this->config.handle.get_stream()); + chunked_mask_row_it::init( + this->config.b_indptr, mask_indptr.data() + less_rows, more_rows, + capacity_threshold * map_size, n_chunks_per_row, chunk_indices, + this->config.handle.get_stream()); + + chunked_mask_row_it more( + this->config.b_indptr, more_rows, mask_indptr.data() + less_rows, + capacity_threshold * map_size, n_chunks_per_row.data(), + chunk_indices.data(), this->config.handle.get_stream()); auto n_more_blocks = more.total_row_blocks * n_blocks_per_row; - this->_dispatch_base_rev(*this, - map_size, - more, - out_dists, - coo_rows_a, - product_func, - accum_func, - write_func, - chunk_size, - n_more_blocks, - n_blocks_per_row); + this->_dispatch_base_rev(*this, map_size, more, out_dists, coo_rows_a, + product_func, accum_func, write_func, chunk_size, + n_more_blocks, n_blocks_per_row); } } - __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size) - { + __device__ inline insert_type init_insert(smem_type cache, + const value_idx &cache_size) { return insert_type::make_from_uninitialized_slots( cooperative_groups::this_thread_block(), cache, cache_size, -1, 0); } - __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value) - { + __device__ inline void insert(insert_type cache, const value_idx &key, + const value_t &value) { auto success = cache.insert(cuco::pair(key, value)); } - __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size) - { + __device__ inline find_type init_find(smem_type cache, + const value_idx &cache_size) { return find_type(cache, cache_size, -1, 0); } - __device__ inline value_t find(find_type cache, const value_idx& key) - { + __device__ inline value_t find(find_type cache, const value_idx &key) { auto a_pair = cache.find(key); value_t a_col = 0.0; - if (a_pair != cache.end()) { a_col = a_pair->second; } + if (a_pair != cache.end()) { + a_col = a_pair->second; + } return a_col; } struct fits_in_hash_table { public: - fits_in_hash_table(const value_idx* indptr_, value_idx degree_l_, value_idx degree_r_) - : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_) - { - } + fits_in_hash_table(const value_idx *indptr_, value_idx degree_l_, + value_idx degree_r_) + : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_) {} - __host__ __device__ bool operator()(const value_idx& i) - { + __host__ __device__ bool operator()(const value_idx &i) { auto degree = indptr[i + 1] - indptr[i]; return degree >= degree_l && degree < degree_r; } private: - const value_idx* indptr; + const value_idx *indptr; const value_idx degree_l, degree_r; }; - inline static int get_map_size() - { - return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) / + inline static int get_map_size() { + return (raft::getSharedMemPerBlock() - + ((tpb / raft::warp_size()) * sizeof(value_t))) / sizeof(typename insert_type::slot_type); } diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh index b12252ab25..51f9a05394 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh @@ -27,88 +27,68 @@ namespace sparse { namespace distance { /** - * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with - * sparse-matrix-sparse-vector multiplication layout (SPMV). - * This is intended to be scheduled n_chunks_b times for each row of a. - * The steps are as follows: - * - * 1. Load row from A into dense vector in shared memory. - * This can be further chunked in the future if necessary to support larger - * column sizes. - * 2. Threads of block all step through chunks of B in parallel. - * When a new row is encountered in row_indices_b, a segmented - * reduction is performed across the warps and then across the - * block and the final value written out to host memory. - * - * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf - * - * @tparam value_idx index type - * @tparam value_t value type - * @tparam tpb threads per block configured on launch - * @tparam rev if this is true, the reduce/accumulate functions are only - * executed when A[col] == 0.0. when executed before/after !rev - * and A & B are reversed, this allows the full symmetric difference - * and intersection to be computed. - * @tparam kv_t data type stored in shared mem cache - * @tparam product_f reduce function type (semiring product() function). - * accepts two arguments of value_t and returns a value_t - * @tparam accum_f accumulation function type (semiring sum() function). - * accepts two arguments of value_t and returns a value_t - * @tparam write_f function to write value out. this should be mathematically - * equivalent to the accumulate function but implemented as - * an atomic operation on global memory. Accepts two arguments - * of value_t* and value_t and updates the value given by the - * pointer. - * @param[in] indptrA column pointer array for A - * @param[in] indicesA column indices array for A - * @param[in] dataA data array for A - * @param[in] rowsB coo row array for B - * @param[in] indicesB column indices array for B - * @param[in] dataB data array for B - * @param[in] m number of rows in A - * @param[in] n number of rows in B - * @param[in] dim number of features - * @param[in] nnz_b number of nonzeros in B - * @param[out] out array of size m*n - * @param[in] n_blocks_per_row number of blocks of B per row of A - * @param[in] chunk_size number of nnz for B to use for each row of A - * @param[in] buffer_size amount of smem to use for each row of A - * @param[in] product_func semiring product() function - * @param[in] accum_func semiring sum() function - * @param[in] write_func atomic semiring sum() function - */ -template -__global__ void balanced_coo_generalized_spmv_kernel(strategy_t strategy, - indptr_it indptrA, - value_idx* indicesA, - value_t* dataA, - value_idx nnz_a, - value_idx* rowsB, - value_idx* indicesB, - value_t* dataB, - value_idx m, - value_idx n, - int dim, - value_idx nnz_b, - value_t* out, - int n_blocks_per_row, - int chunk_size, - value_idx b_ncols, - product_f product_func, - accum_f accum_func, - write_f write_func) -{ + * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with + * sparse-matrix-sparse-vector multiplication layout (SPMV). + * This is intended to be scheduled n_chunks_b times for each row of a. + * The steps are as follows: + * + * 1. Load row from A into dense vector in shared memory. + * This can be further chunked in the future if necessary to support larger + * column sizes. + * 2. Threads of block all step through chunks of B in parallel. + * When a new row is encountered in row_indices_b, a segmented + * reduction is performed across the warps and then across the + * block and the final value written out to host memory. + * + * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf + * + * @tparam value_idx index type + * @tparam value_t value type + * @tparam tpb threads per block configured on launch + * @tparam rev if this is true, the reduce/accumulate functions are only + * executed when A[col] == 0.0. when executed before/after !rev + * and A & B are reversed, this allows the full symmetric difference + * and intersection to be computed. + * @tparam kv_t data type stored in shared mem cache + * @tparam product_f reduce function type (semiring product() function). + * accepts two arguments of value_t and returns a value_t + * @tparam accum_f accumulation function type (semiring sum() function). + * accepts two arguments of value_t and returns a value_t + * @tparam write_f function to write value out. this should be mathematically + * equivalent to the accumulate function but implemented as + * an atomic operation on global memory. Accepts two arguments + * of value_t* and value_t and updates the value given by the + * pointer. + * @param[in] indptrA column pointer array for A + * @param[in] indicesA column indices array for A + * @param[in] dataA data array for A + * @param[in] rowsB coo row array for B + * @param[in] indicesB column indices array for B + * @param[in] dataB data array for B + * @param[in] m number of rows in A + * @param[in] n number of rows in B + * @param[in] dim number of features + * @param[in] nnz_b number of nonzeros in B + * @param[out] out array of size m*n + * @param[in] n_blocks_per_row number of blocks of B per row of A + * @param[in] chunk_size number of nnz for B to use for each row of A + * @param[in] buffer_size amount of smem to use for each row of A + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + */ +template +__global__ void balanced_coo_generalized_spmv_kernel( + strategy_t strategy, indptr_it indptrA, value_idx *indicesA, value_t *dataA, + value_idx nnz_a, value_idx *rowsB, value_idx *indicesB, value_t *dataB, + value_idx m, value_idx n, int dim, value_idx nnz_b, value_t *out, + int n_blocks_per_row, int chunk_size, value_idx b_ncols, + product_f product_func, accum_f accum_func, write_f write_func) { typedef cub::WarpReduce warp_reduce; - value_idx cur_row_a = indptrA.get_row_idx(n_blocks_per_row); + value_idx cur_row_a = indptrA.get_row_idx(n_blocks_per_row); value_idx cur_chunk_offset = blockIdx.x % n_blocks_per_row; // chunk starting offset @@ -116,17 +96,18 @@ __global__ void balanced_coo_generalized_spmv_kernel(strategy_t strategy, // how many total cols will be processed by this block (should be <= chunk_size * n_threads) value_idx active_chunk_size = min(chunk_size * tpb, nnz_b - ind_offset); - int tid = threadIdx.x; + int tid = threadIdx.x; int warp_id = tid / raft::warp_size(); // compute id relative to current warp unsigned int lane_id = tid & (raft::warp_size() - 1); - value_idx ind = ind_offset + threadIdx.x; + value_idx ind = ind_offset + threadIdx.x; extern __shared__ char smem[]; - typename strategy_t::smem_type A = (typename strategy_t::smem_type)(smem); - typename warp_reduce::TempStorage* temp_storage = (typename warp_reduce::TempStorage*)(A + dim); + typename strategy_t::smem_type A = (typename strategy_t::smem_type)(smem); + typename warp_reduce::TempStorage *temp_storage = + (typename warp_reduce::TempStorage *)(A + dim); auto inserter = strategy.init_insert(A, dim); @@ -134,12 +115,13 @@ __global__ void balanced_coo_generalized_spmv_kernel(strategy_t strategy, value_idx start_offset_a, stop_offset_a; bool first_a_chunk, last_a_chunk; - indptrA.get_row_offsets( - cur_row_a, start_offset_a, stop_offset_a, n_blocks_per_row, first_a_chunk, last_a_chunk); + indptrA.get_row_offsets(cur_row_a, start_offset_a, stop_offset_a, + n_blocks_per_row, first_a_chunk, last_a_chunk); // Convert current row vector in A to dense for (int i = tid; i <= (stop_offset_a - start_offset_a); i += blockDim.x) { - strategy.insert(inserter, indicesA[start_offset_a + i], dataA[start_offset_a + i]); + strategy.insert(inserter, indicesA[start_offset_a + i], + dataA[start_offset_a + i]); } __syncthreads(); @@ -150,36 +132,34 @@ __global__ void balanced_coo_generalized_spmv_kernel(strategy_t strategy, if (ind >= nnz_b) return; value_idx start_index_a = 0, stop_index_a = b_ncols - 1; - indptrA.get_indices_boundary(indicesA, - cur_row_a, - start_offset_a, - stop_offset_a, - start_index_a, - stop_index_a, - first_a_chunk, - last_a_chunk); + indptrA.get_indices_boundary(indicesA, cur_row_a, start_offset_a, + stop_offset_a, start_index_a, stop_index_a, + first_a_chunk, last_a_chunk); value_idx cur_row_b = -1; - value_t c = 0.0; + value_t c = 0.0; auto warp_red = warp_reduce(*(temp_storage + warp_id)); if (tid < active_chunk_size) { cur_row_b = rowsB[ind]; - auto index_b = indicesB[ind]; - auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); + auto index_b = indicesB[ind]; + auto in_bounds = + indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); if (in_bounds) { value_t a_col = strategy.find(finder, index_b); - if (!rev || a_col == 0.0) { c = product_func(a_col, dataB[ind]); } + if (!rev || a_col == 0.0) { + c = product_func(a_col, dataB[ind]); + } } } // loop through chunks in parallel, reducing when a new row is // encountered by each thread for (int i = tid; i < active_chunk_size; i += blockDim.x) { - value_idx ind_next = ind + blockDim.x; + value_idx ind_next = ind + blockDim.x; value_idx next_row_b = -1; if (i + blockDim.x < active_chunk_size) next_row_b = rowsB[ind_next]; @@ -190,13 +170,14 @@ __global__ void balanced_coo_generalized_spmv_kernel(strategy_t strategy, // grab the threads currently participating in loops. // because any other threads should have returned already. unsigned int peer_group = __match_any_sync(0xffffffff, cur_row_b); - bool is_leader = get_lowest_peer(peer_group) == lane_id; - value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func); + bool is_leader = get_lowest_peer(peer_group) == lane_id; + value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func); // thread with lowest lane id among peers writes out if (is_leader && v != 0.0) { // this conditional should be uniform, since rev is constant - size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b : (size_t)cur_row_b * m + cur_row_a; + size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b + : (size_t)cur_row_b * m + cur_row_a; write_func(out + idx, v); } @@ -206,12 +187,15 @@ __global__ void balanced_coo_generalized_spmv_kernel(strategy_t strategy, if (next_row_b != -1) { ind = ind_next; - auto index_b = indicesB[ind]; - auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); + auto index_b = indicesB[ind]; + auto in_bounds = + indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); if (in_bounds) { value_t a_col = strategy.find(finder, index_b); - if (!rev || a_col == 0.0) { c = accum_func(c, product_func(a_col, dataB[ind])); } + if (!rev || a_col == 0.0) { + c = accum_func(c, product_func(a_col, dataB[ind])); + } } cur_row_b = next_row_b; diff --git a/cpp/include/raft/sparse/distance/distance.cuh b/cpp/include/raft/sparse/distance/distance.cuh index 228a62ed7a..a1974b3666 100644 --- a/cpp/include/raft/sparse/distance/distance.cuh +++ b/cpp/include/raft/sparse/distance/distance.cuh @@ -74,17 +74,16 @@ static const std::unordered_set supportedDistance{ * @param[in] metric distance metric to use */ template -void pairwiseDistance(value_t* out, +void pairwiseDistance(value_t *out, distances_config_t input_config, - raft::distance::DistanceType metric, - float metric_arg) -{ + raft::distance::DistanceType metric, float metric_arg) { switch (metric) { case raft::distance::DistanceType::L2Expanded: l2_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::L2SqrtExpanded: - l2_sqrt_expanded_distances_t(input_config).compute(out); + l2_sqrt_expanded_distances_t(input_config) + .compute(out); break; case raft::distance::DistanceType::InnerProduct: ip_distances_t(input_config).compute(out); @@ -93,49 +92,62 @@ void pairwiseDistance(value_t* out, l2_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::L2SqrtUnexpanded: - l2_sqrt_unexpanded_distances_t(input_config).compute(out); + l2_sqrt_unexpanded_distances_t(input_config) + .compute(out); break; case raft::distance::DistanceType::L1: l1_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::LpUnexpanded: - lp_unexpanded_distances_t(input_config, metric_arg).compute(out); + lp_unexpanded_distances_t(input_config, metric_arg) + .compute(out); break; case raft::distance::DistanceType::Linf: - linf_unexpanded_distances_t(input_config).compute(out); + linf_unexpanded_distances_t(input_config) + .compute(out); break; case raft::distance::DistanceType::Canberra: - canberra_unexpanded_distances_t(input_config).compute(out); + canberra_unexpanded_distances_t(input_config) + .compute(out); break; case raft::distance::DistanceType::JaccardExpanded: - jaccard_expanded_distances_t(input_config).compute(out); + jaccard_expanded_distances_t(input_config) + .compute(out); break; case raft::distance::DistanceType::CosineExpanded: - cosine_expanded_distances_t(input_config).compute(out); + cosine_expanded_distances_t(input_config) + .compute(out); break; case raft::distance::DistanceType::HellingerExpanded: - hellinger_expanded_distances_t(input_config).compute(out); + hellinger_expanded_distances_t(input_config) + .compute(out); break; case raft::distance::DistanceType::DiceExpanded: dice_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::CorrelationExpanded: - correlation_expanded_distances_t(input_config).compute(out); + correlation_expanded_distances_t(input_config) + .compute(out); break; case raft::distance::DistanceType::RusselRaoExpanded: - russelrao_expanded_distances_t(input_config).compute(out); + russelrao_expanded_distances_t(input_config) + .compute(out); break; case raft::distance::DistanceType::HammingUnexpanded: - hamming_unexpanded_distances_t(input_config).compute(out); + hamming_unexpanded_distances_t(input_config) + .compute(out); break; case raft::distance::DistanceType::JensenShannon: - jensen_shannon_unexpanded_distances_t(input_config).compute(out); + jensen_shannon_unexpanded_distances_t(input_config) + .compute(out); break; case raft::distance::DistanceType::KLDivergence: - kl_divergence_unexpanded_distances_t(input_config).compute(out); + kl_divergence_unexpanded_distances_t(input_config) + .compute(out); break; - default: THROW("Unsupported distance: %d", metric); + default: + THROW("Unsupported distance: %d", metric); } } diff --git a/cpp/include/raft/sparse/distance/ip_distance.cuh b/cpp/include/raft/sparse/distance/ip_distance.cuh index 8d77f9f5b5..882ccba027 100644 --- a/cpp/include/raft/sparse/distance/ip_distance.cuh +++ b/cpp/include/raft/sparse/distance/ip_distance.cuh @@ -45,13 +45,10 @@ class ip_distances_t : public distances_t { * Computes simple sparse inner product distances as sum(x_y * y_k) * @param[in] config specifies inputs, outputs, and sizes */ - ip_distances_t(const distances_config_t& config) - : config_(&config), coo_rows_b(config.b_nnz, config.handle.get_stream()) - { - raft::sparse::convert::csr_to_coo(config_->b_indptr, - config_->b_nrows, - coo_rows_b.data(), - config_->b_nnz, + ip_distances_t(const distances_config_t &config) + : config_(&config), coo_rows_b(config.b_nnz, config.handle.get_stream()) { + raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, + coo_rows_b.data(), config_->b_nnz, config_->handle.get_stream()); } @@ -59,21 +56,21 @@ class ip_distances_t : public distances_t { * Performs pairwise distance computation and computes output distances * @param out_distances dense output matrix (size a_nrows * b_nrows) */ - void compute(value_t* out_distances) - { + void compute(value_t *out_distances) { /** - * Compute pairwise distances and return dense matrix in row-major format - */ + * Compute pairwise distances and return dense matrix in row-major format + */ balanced_coo_pairwise_generalized_spmv( - out_distances, *config_, coo_rows_b.data(), Product(), Sum(), AtomicAdd()); + out_distances, *config_, coo_rows_b.data(), Product(), Sum(), + AtomicAdd()); } - value_idx* b_rows_coo() { return coo_rows_b.data(); } + value_idx *b_rows_coo() { return coo_rows_b.data(); } - value_t* b_data_coo() { return config_->b_data; } + value_t *b_data_coo() { return config_->b_data; } private: - const distances_config_t* config_; + const distances_config_t *config_; rmm::device_uvector coo_rows_b; }; }; // END namespace distance diff --git a/cpp/include/raft/sparse/distance/l2_distance.cuh b/cpp/include/raft/sparse/distance/l2_distance.cuh index a9a2d1ee91..8886d4c9df 100644 --- a/cpp/include/raft/sparse/distance/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/l2_distance.cuh @@ -41,36 +41,35 @@ namespace distance { // @TODO: Move this into sparse prims (coo_norm) template -__global__ void compute_row_norm_kernel(value_t* out, - const value_idx* __restrict__ coo_rows, - const value_t* __restrict__ data, - value_idx nnz) -{ +__global__ void compute_row_norm_kernel(value_t *out, + const value_idx *__restrict__ coo_rows, + const value_t *__restrict__ data, + value_idx nnz) { value_idx i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i] * data[i]); } + if (i < nnz) { + atomicAdd(&out[coo_rows[i]], data[i] * data[i]); + } } template -__global__ void compute_row_sum_kernel(value_t* out, - const value_idx* __restrict__ coo_rows, - const value_t* __restrict__ data, - value_idx nnz) -{ +__global__ void compute_row_sum_kernel(value_t *out, + const value_idx *__restrict__ coo_rows, + const value_t *__restrict__ data, + value_idx nnz) { value_idx i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i]); } + if (i < nnz) { + atomicAdd(&out[coo_rows[i]], data[i]); + } } template -__global__ void compute_euclidean_warp_kernel(value_t* __restrict__ C, - const value_t* __restrict__ Q_sq_norms, - const value_t* __restrict__ R_sq_norms, - value_idx n_rows, - value_idx n_cols, - expansion_f expansion_func) -{ +__global__ void compute_euclidean_warp_kernel( + value_t *__restrict__ C, const value_t *__restrict__ Q_sq_norms, + const value_t *__restrict__ R_sq_norms, value_idx n_rows, value_idx n_cols, + expansion_f expansion_func) { value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; - value_idx i = tid / n_cols; - value_idx j = tid % n_cols; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; if (i >= n_rows || j >= n_cols) return; @@ -84,29 +83,25 @@ __global__ void compute_euclidean_warp_kernel(value_t* __restrict__ C, } template -__global__ void compute_correlation_warp_kernel(value_t* __restrict__ C, - const value_t* __restrict__ Q_sq_norms, - const value_t* __restrict__ R_sq_norms, - const value_t* __restrict__ Q_norms, - const value_t* __restrict__ R_norms, - value_idx n_rows, - value_idx n_cols, - value_idx n) -{ +__global__ void compute_correlation_warp_kernel( + value_t *__restrict__ C, const value_t *__restrict__ Q_sq_norms, + const value_t *__restrict__ R_sq_norms, const value_t *__restrict__ Q_norms, + const value_t *__restrict__ R_norms, value_idx n_rows, value_idx n_cols, + value_idx n) { value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; - value_idx i = tid / n_cols; - value_idx j = tid % n_cols; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; if (i >= n_rows || j >= n_cols) return; - value_t dot = C[(size_t)i * n_cols + j]; + value_t dot = C[(size_t)i * n_cols + j]; value_t Q_l1 = Q_norms[i]; value_t R_l1 = R_norms[j]; value_t Q_l2 = Q_sq_norms[i]; value_t R_l2 = R_sq_norms[j]; - value_t numer = n * dot - (Q_l1 * R_l1); + value_t numer = n * dot - (Q_l1 * R_l1); value_t Q_denom = n * Q_l2 - (Q_l1 * Q_l1); value_t R_denom = n * R_l2 - (R_l1 * R_l1); @@ -116,77 +111,58 @@ __global__ void compute_correlation_warp_kernel(value_t* __restrict__ C, C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001); } -template -void compute_euclidean(value_t* C, - const value_t* Q_sq_norms, - const value_t* R_sq_norms, - value_idx n_rows, - value_idx n_cols, - cudaStream_t stream, - expansion_f expansion_func) -{ +template +void compute_euclidean(value_t *C, const value_t *Q_sq_norms, + const value_t *R_sq_norms, value_idx n_rows, + value_idx n_cols, cudaStream_t stream, + expansion_f expansion_func) { int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); compute_euclidean_warp_kernel<<>>( C, Q_sq_norms, R_sq_norms, n_rows, n_cols, expansion_func); } -template -void compute_l2(value_t* out, - const value_idx* Q_coo_rows, - const value_t* Q_data, - value_idx Q_nnz, - const value_idx* R_coo_rows, - const value_t* R_data, - value_idx R_nnz, - value_idx m, - value_idx n, +template +void compute_l2(value_t *out, const value_idx *Q_coo_rows, + const value_t *Q_data, value_idx Q_nnz, + const value_idx *R_coo_rows, const value_t *R_data, + value_idx R_nnz, value_idx m, value_idx n, std::shared_ptr alloc, - cudaStream_t stream, - expansion_f expansion_func) -{ + cudaStream_t stream, expansion_f expansion_func) { rmm::device_uvector Q_sq_norms(m, stream); rmm::device_uvector R_sq_norms(n, stream); - CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); - CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK( + cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK( + cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); compute_row_norm_kernel<<>>( Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); compute_row_norm_kernel<<>>( R_sq_norms.data(), R_coo_rows, R_data, R_nnz); - compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream, expansion_func); + compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream, + expansion_func); } template -void compute_correlation(value_t* C, - const value_t* Q_sq_norms, - const value_t* R_sq_norms, - const value_t* Q_norms, - const value_t* R_norms, - value_idx n_rows, - value_idx n_cols, - value_idx n, - cudaStream_t stream) -{ +void compute_correlation(value_t *C, const value_t *Q_sq_norms, + const value_t *R_sq_norms, const value_t *Q_norms, + const value_t *R_norms, value_idx n_rows, + value_idx n_cols, value_idx n, cudaStream_t stream) { int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); compute_correlation_warp_kernel<<>>( C, Q_sq_norms, R_sq_norms, Q_norms, R_norms, n_rows, n_cols, n); } template -void compute_corr(value_t* out, - const value_idx* Q_coo_rows, - const value_t* Q_data, - value_idx Q_nnz, - const value_idx* R_coo_rows, - const value_t* R_data, - value_idx R_nnz, - value_idx m, - value_idx n, - value_idx n_cols, +void compute_corr(value_t *out, const value_idx *Q_coo_rows, + const value_t *Q_data, value_idx Q_nnz, + const value_idx *R_coo_rows, const value_t *R_data, + value_idx R_nnz, value_idx m, value_idx n, value_idx n_cols, std::shared_ptr alloc, - cudaStream_t stream) -{ + cudaStream_t stream) { // sum_sq for std dev rmm::device_uvector Q_sq_norms(m, stream); rmm::device_uvector R_sq_norms(n, stream); @@ -195,11 +171,15 @@ void compute_corr(value_t* out, rmm::device_uvector Q_norms(m, stream); rmm::device_uvector R_norms(n, stream); - CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); - CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK( + cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK( + cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); - CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); - CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); + CUDA_CHECK( + cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); + CUDA_CHECK( + cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); compute_row_norm_kernel<<>>( Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); @@ -211,15 +191,8 @@ void compute_corr(value_t* out, compute_row_sum_kernel<<>>( R_norms.data(), R_coo_rows, R_data, R_nnz); - compute_correlation(out, - Q_sq_norms.data(), - R_sq_norms.data(), - Q_norms.data(), - R_norms.data(), - m, - n, - n_cols, - stream); + compute_correlation(out, Q_sq_norms.data(), R_sq_norms.data(), Q_norms.data(), + R_norms.data(), m, n, n_cols, stream); } /** @@ -229,45 +202,35 @@ void compute_corr(value_t* out, template class l2_expanded_distances_t : public distances_t { public: - explicit l2_expanded_distances_t(const distances_config_t& config) - : config_(&config), ip_dists(config) - { - } + explicit l2_expanded_distances_t( + const distances_config_t &config) + : config_(&config), ip_dists(config) {} - void compute(value_t* out_dists) - { + void compute(value_t *out_dists) { ip_dists.compute(out_dists); - value_idx* b_indices = ip_dists.b_rows_coo(); - value_t* b_data = ip_dists.b_data_coo(); + value_idx *b_indices = ip_dists.b_rows_coo(); + value_t *b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, - config_->a_nrows, - search_coo_rows.data(), - config_->a_nnz, + rmm::device_uvector search_coo_rows( + config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, + search_coo_rows.data(), config_->a_nnz, config_->handle.get_stream()); - compute_l2(out_dists, - search_coo_rows.data(), - config_->a_data, - config_->a_nnz, - b_indices, - b_data, - config_->b_nnz, - config_->a_nrows, - config_->b_nrows, - config_->handle.get_device_allocator(), - config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - return -2 * dot + q_norm + r_norm; - }); + compute_l2( + out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, + b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, + config_->handle.get_device_allocator(), config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + return -2 * dot + q_norm + r_norm; + }); } ~l2_expanded_distances_t() = default; protected: - const distances_config_t* config_; + const distances_config_t *config_; ip_distances_t ip_dists; }; @@ -276,21 +239,18 @@ class l2_expanded_distances_t : public distances_t { * The expanded form is more efficient for sparse data. */ template -class l2_sqrt_expanded_distances_t : public l2_expanded_distances_t { +class l2_sqrt_expanded_distances_t + : public l2_expanded_distances_t { public: - explicit l2_sqrt_expanded_distances_t(const distances_config_t& config) - : l2_expanded_distances_t(config) - { - } + explicit l2_sqrt_expanded_distances_t( + const distances_config_t &config) + : l2_expanded_distances_t(config) {} - void compute(value_t* out_dists) override - { + void compute(value_t *out_dists) override { l2_expanded_distances_t::compute(out_dists); // Sqrt Post-processing raft::linalg::unaryOp( - out_dists, - out_dists, - this->config_->a_nrows * this->config_->b_nrows, + out_dists, out_dists, this->config_->a_nrows * this->config_->b_nrows, [] __device__(value_t input) { int neg = input < 0 ? -1 : 1; return sqrt(abs(input) * neg); @@ -304,35 +264,25 @@ class l2_sqrt_expanded_distances_t : public l2_expanded_distances_t class correlation_expanded_distances_t : public distances_t { public: - explicit correlation_expanded_distances_t(const distances_config_t& config) - : config_(&config), ip_dists(config) - { - } + explicit correlation_expanded_distances_t( + const distances_config_t &config) + : config_(&config), ip_dists(config) {} - void compute(value_t* out_dists) - { + void compute(value_t *out_dists) { ip_dists.compute(out_dists); - value_idx* b_indices = ip_dists.b_rows_coo(); - value_t* b_data = ip_dists.b_data_coo(); + value_idx *b_indices = ip_dists.b_rows_coo(); + value_t *b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, - config_->a_nrows, - search_coo_rows.data(), - config_->a_nnz, + rmm::device_uvector search_coo_rows( + config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, + search_coo_rows.data(), config_->a_nnz, config_->handle.get_stream()); - compute_corr(out_dists, - search_coo_rows.data(), - config_->a_data, - config_->a_nnz, - b_indices, - b_data, - config_->b_nnz, - config_->a_nrows, - config_->b_nrows, - config_->b_ncols, + compute_corr(out_dists, search_coo_rows.data(), config_->a_data, + config_->a_nnz, b_indices, b_data, config_->b_nnz, + config_->a_nrows, config_->b_nrows, config_->b_ncols, config_->handle.get_device_allocator(), config_->handle.get_stream()); } @@ -340,62 +290,54 @@ class correlation_expanded_distances_t : public distances_t { ~correlation_expanded_distances_t() = default; protected: - const distances_config_t* config_; + const distances_config_t *config_; ip_distances_t ip_dists; }; /** - * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k)^2) * - * sqrt(sum(y_k)^2))) The expanded form is more efficient for sparse data. + * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k)^2) * sqrt(sum(y_k)^2))) + * The expanded form is more efficient for sparse data. */ template class cosine_expanded_distances_t : public distances_t { public: - explicit cosine_expanded_distances_t(const distances_config_t& config) - : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) - { - } + explicit cosine_expanded_distances_t( + const distances_config_t &config) + : config_(&config), + workspace(0, config.handle.get_stream()), + ip_dists(config) {} - void compute(value_t* out_dists) - { + void compute(value_t *out_dists) { ip_dists.compute(out_dists); - value_idx* b_indices = ip_dists.b_rows_coo(); - value_t* b_data = ip_dists.b_data_coo(); + value_idx *b_indices = ip_dists.b_rows_coo(); + value_t *b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, - config_->a_nrows, - search_coo_rows.data(), - config_->a_nnz, + rmm::device_uvector search_coo_rows( + config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, + search_coo_rows.data(), config_->a_nnz, config_->handle.get_stream()); - compute_l2(out_dists, - search_coo_rows.data(), - config_->a_data, - config_->a_nnz, - b_indices, - b_data, - config_->b_nnz, - config_->a_nrows, - config_->b_nrows, - config_->handle.get_device_allocator(), - config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - value_t norms = sqrt(q_norm) * sqrt(r_norm); - // deal with potential for 0 in denominator by forcing 0/1 instead - value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms); - - // flip the similarity when both rows are 0 - bool both_empty = (q_norm == 0) && (r_norm == 0); - return 1 - ((!both_empty * cos) + both_empty); - }); + compute_l2( + out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, + b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, + config_->handle.get_device_allocator(), config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t norms = sqrt(q_norm) * sqrt(r_norm); + // deal with potential for 0 in denominator by forcing 0/1 instead + value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms); + + // flip the similarity when both rows are 0 + bool both_empty = (q_norm == 0) && (r_norm == 0); + return 1 - ((!both_empty * cos) + both_empty); + }); } ~cosine_expanded_distances_t() = default; private: - const distances_config_t* config_; + const distances_config_t *config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; @@ -412,34 +354,25 @@ class cosine_expanded_distances_t : public distances_t { template class hellinger_expanded_distances_t : public distances_t { public: - explicit hellinger_expanded_distances_t(const distances_config_t& config) - : config_(&config), workspace(0, config.handle.get_stream()) - { - } + explicit hellinger_expanded_distances_t( + const distances_config_t &config) + : config_(&config), workspace(0, config.handle.get_stream()) {} - void compute(value_t* out_dists) - { + void compute(value_t *out_dists) { rmm::device_uvector coo_rows(max(config_->b_nnz, config_->a_nnz), config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->b_indptr, - config_->b_nrows, - coo_rows.data(), - config_->b_nnz, + raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, + coo_rows.data(), config_->b_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv( - out_dists, - *config_, - coo_rows.data(), - [] __device__(value_t a, value_t b) { return sqrt(a) * sqrt(b); }, - Sum(), + out_dists, *config_, coo_rows.data(), + [] __device__(value_t a, value_t b) { return sqrt(a) * sqrt(b); }, Sum(), AtomicAdd()); raft::linalg::unaryOp( - out_dists, - out_dists, - config_->a_nrows * config_->b_nrows, + out_dists, out_dists, config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative bool rectifier = (1 - input) > 0; @@ -451,43 +384,42 @@ class hellinger_expanded_distances_t : public distances_t { ~hellinger_expanded_distances_t() = default; private: - const distances_config_t* config_; + const distances_config_t *config_; rmm::device_uvector workspace; }; template class russelrao_expanded_distances_t : public distances_t { public: - explicit russelrao_expanded_distances_t(const distances_config_t& config) - : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) - { - } + explicit russelrao_expanded_distances_t( + const distances_config_t &config) + : config_(&config), + workspace(0, config.handle.get_stream()), + ip_dists(config) {} - void compute(value_t* out_dists) - { + void compute(value_t *out_dists) { ip_dists.compute(out_dists); - value_t n_cols = config_->a_ncols; + value_t n_cols = config_->a_ncols; value_t n_cols_inv = 1.0 / n_cols; raft::linalg::unaryOp( - out_dists, - out_dists, - config_->a_nrows * config_->b_nrows, + out_dists, out_dists, config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return (n_cols - input) * n_cols_inv; }, config_->handle.get_stream()); - auto exec_policy = rmm::exec_policy(config_->handle.get_stream()); - auto diags = thrust::counting_iterator(0); + auto exec_policy = rmm::exec_policy(config_->handle.get_stream()); + auto diags = thrust::counting_iterator(0); value_idx b_nrows = config_->b_nrows; - thrust::for_each(exec_policy, diags, diags + config_->a_nrows, [=] __device__(value_idx input) { - out_dists[input * b_nrows + input] = 0.0; - }); + thrust::for_each(exec_policy, diags, diags + config_->a_nrows, + [=] __device__(value_idx input) { + out_dists[input * b_nrows + input] = 0.0; + }); } ~russelrao_expanded_distances_t() = default; private: - const distances_config_t* config_; + const distances_config_t *config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; diff --git a/cpp/include/raft/sparse/distance/lp_distance.cuh b/cpp/include/raft/sparse/distance/lp_distance.cuh index 7f9511ff03..885d55ee50 100644 --- a/cpp/include/raft/sparse/distance/lp_distance.cuh +++ b/cpp/include/raft/sparse/distance/lp_distance.cuh @@ -38,33 +38,23 @@ namespace raft { namespace sparse { namespace distance { -template -void unexpanded_lp_distances(value_t* out_dists, - const distances_config_t* config_, - product_f product_func, - accum_f accum_func, - write_f write_func) -{ +template +void unexpanded_lp_distances( + value_t *out_dists, const distances_config_t *config_, + product_f product_func, accum_f accum_func, write_f write_func) { rmm::device_uvector coo_rows(max(config_->b_nnz, config_->a_nnz), config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->b_indptr, - config_->b_nrows, - coo_rows.data(), - config_->b_nnz, + raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, + coo_rows.data(), config_->b_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv( out_dists, *config_, coo_rows.data(), product_func, accum_func, write_func); - raft::sparse::convert::csr_to_coo(config_->a_indptr, - config_->a_nrows, - coo_rows.data(), - config_->a_nnz, + raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, + coo_rows.data(), config_->a_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv_rev( @@ -81,51 +71,48 @@ void unexpanded_lp_distances(value_t* out_dists, template class l1_unexpanded_distances_t : public distances_t { public: - l1_unexpanded_distances_t(const distances_config_t& config) : config_(&config) - { - } + l1_unexpanded_distances_t( + const distances_config_t &config) + : config_(&config) {} - void compute(value_t* out_dists) - { - unexpanded_lp_distances(out_dists, config_, AbsDiff(), Sum(), AtomicAdd()); + void compute(value_t *out_dists) { + unexpanded_lp_distances(out_dists, config_, AbsDiff(), + Sum(), AtomicAdd()); } private: - const distances_config_t* config_; + const distances_config_t *config_; }; template class l2_unexpanded_distances_t : public distances_t { public: - l2_unexpanded_distances_t(const distances_config_t& config) : config_(&config) - { - } + l2_unexpanded_distances_t( + const distances_config_t &config) + : config_(&config) {} - void compute(value_t* out_dists) - { - unexpanded_lp_distances(out_dists, config_, SqDiff(), Sum(), AtomicAdd()); + void compute(value_t *out_dists) { + unexpanded_lp_distances(out_dists, config_, SqDiff(), + Sum(), AtomicAdd()); } protected: - const distances_config_t* config_; + const distances_config_t *config_; }; template -class l2_sqrt_unexpanded_distances_t : public l2_unexpanded_distances_t { +class l2_sqrt_unexpanded_distances_t + : public l2_unexpanded_distances_t { public: - l2_sqrt_unexpanded_distances_t(const distances_config_t& config) - : l2_unexpanded_distances_t(config) - { - } + l2_sqrt_unexpanded_distances_t( + const distances_config_t &config) + : l2_unexpanded_distances_t(config) {} - void compute(value_t* out_dists) - { + void compute(value_t *out_dists) { l2_unexpanded_distances_t::compute(out_dists); // Sqrt Post-processing raft::linalg::unaryOp( - out_dists, - out_dists, - this->config_->a_nrows * this->config_->b_nrows, + out_dists, out_dists, this->config_->a_nrows * this->config_->b_nrows, [] __device__(value_t input) { int neg = input < 0 ? -1 : 1; return sqrt(abs(input) * neg); @@ -137,33 +124,29 @@ class l2_sqrt_unexpanded_distances_t : public l2_unexpanded_distances_t class linf_unexpanded_distances_t : public distances_t { public: - explicit linf_unexpanded_distances_t(const distances_config_t& config) - : config_(&config) - { - } + explicit linf_unexpanded_distances_t( + const distances_config_t &config) + : config_(&config) {} - void compute(value_t* out_dists) - { - unexpanded_lp_distances(out_dists, config_, AbsDiff(), Max(), AtomicMax()); + void compute(value_t *out_dists) { + unexpanded_lp_distances(out_dists, config_, AbsDiff(), + Max(), AtomicMax()); } private: - const distances_config_t* config_; + const distances_config_t *config_; }; template class canberra_unexpanded_distances_t : public distances_t { public: - explicit canberra_unexpanded_distances_t(const distances_config_t& config) - : config_(&config) - { - } + explicit canberra_unexpanded_distances_t( + const distances_config_t &config) + : config_(&config) {} - void compute(value_t* out_dists) - { + void compute(value_t *out_dists) { unexpanded_lp_distances( - out_dists, - config_, + out_dists, config_, [] __device__(value_t a, value_t b) { value_t d = fabs(a) + fabs(b); @@ -171,82 +154,70 @@ class canberra_unexpanded_distances_t : public distances_t { // forcing 1/0 instead return ((d != 0) * fabs(a - b)) / (d + (d == 0)); }, - Sum(), - AtomicAdd()); + Sum(), AtomicAdd()); } private: - const distances_config_t* config_; + const distances_config_t *config_; }; template class lp_unexpanded_distances_t : public distances_t { public: - explicit lp_unexpanded_distances_t(const distances_config_t& config, - value_t p_) - : config_(&config), p(p_) - { - } + explicit lp_unexpanded_distances_t( + const distances_config_t &config, value_t p_) + : config_(&config), p(p_) {} - void compute(value_t* out_dists) - { - unexpanded_lp_distances(out_dists, config_, PDiff(p), Sum(), AtomicAdd()); + void compute(value_t *out_dists) { + unexpanded_lp_distances(out_dists, config_, PDiff(p), + Sum(), AtomicAdd()); float one_over_p = 1.0f / p; raft::linalg::unaryOp( - out_dists, - out_dists, - config_->a_nrows * config_->b_nrows, + out_dists, out_dists, config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return pow(input, one_over_p); }, config_->handle.get_stream()); } private: - const distances_config_t* config_; + const distances_config_t *config_; value_t p; }; template class hamming_unexpanded_distances_t : public distances_t { public: - explicit hamming_unexpanded_distances_t(const distances_config_t& config) - : config_(&config) - { - } + explicit hamming_unexpanded_distances_t( + const distances_config_t &config) + : config_(&config) {} - void compute(value_t* out_dists) - { - unexpanded_lp_distances(out_dists, config_, NotEqual(), Sum(), AtomicAdd()); + void compute(value_t *out_dists) { + unexpanded_lp_distances(out_dists, config_, NotEqual(), + Sum(), AtomicAdd()); value_t n_cols = 1.0 / config_->a_ncols; raft::linalg::unaryOp( - out_dists, - out_dists, - config_->a_nrows * config_->b_nrows, + out_dists, out_dists, config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return input * n_cols; }, config_->handle.get_stream()); } private: - const distances_config_t* config_; + const distances_config_t *config_; }; template class jensen_shannon_unexpanded_distances_t : public distances_t { public: explicit jensen_shannon_unexpanded_distances_t( - const distances_config_t& config) - : config_(&config) - { - } + const distances_config_t &config) + : config_(&config) {} - void compute(value_t* out_dists) - { + void compute(value_t *out_dists) { unexpanded_lp_distances( - out_dists, - config_, + out_dists, config_, [] __device__(value_t a, value_t b) { - value_t m = 0.5f * (a + b); + value_t m = 0.5f * (a + b); bool a_zero = a == 0; bool b_zero = b == 0; @@ -256,61 +227,49 @@ class jensen_shannon_unexpanded_distances_t : public distances_t { bool x_zero = x == 0; bool y_zero = y == 0; - return (-a * (!x_zero * log(x + x_zero))) + (-b * (!y_zero * log(y + y_zero))); + return (-a * (!x_zero * log(x + x_zero))) + + (-b * (!y_zero * log(y + y_zero))); }, - Sum(), - AtomicAdd()); + Sum(), AtomicAdd()); raft::linalg::unaryOp( - out_dists, - out_dists, - config_->a_nrows * config_->b_nrows, + out_dists, out_dists, config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return sqrt(0.5 * input); }, config_->handle.get_stream()); } private: - const distances_config_t* config_; + const distances_config_t *config_; }; template class kl_divergence_unexpanded_distances_t : public distances_t { public: explicit kl_divergence_unexpanded_distances_t( - const distances_config_t& config) - : config_(&config) - { - } + const distances_config_t &config) + : config_(&config) {} - void compute(value_t* out_dists) - { + void compute(value_t *out_dists) { rmm::device_uvector coo_rows(max(config_->b_nnz, config_->a_nnz), config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->b_indptr, - config_->b_nrows, - coo_rows.data(), - config_->b_nnz, + raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, + coo_rows.data(), config_->b_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv( - out_dists, - *config_, - coo_rows.data(), - [] __device__(value_t a, value_t b) { return a * log(a / b); }, - Sum(), + out_dists, *config_, coo_rows.data(), + [] __device__(value_t a, value_t b) { return a * log(a / b); }, Sum(), AtomicAdd()); raft::linalg::unaryOp( - out_dists, - out_dists, - config_->a_nrows * config_->b_nrows, + out_dists, out_dists, config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return 0.5 * input; }, config_->handle.get_stream()); } private: - const distances_config_t* config_; + const distances_config_t *config_; }; }; // END namespace distance diff --git a/cpp/include/raft/sparse/distance/operators.cuh b/cpp/include/raft/sparse/distance/operators.cuh index 3a9d0ba879..89acda8b1a 100644 --- a/cpp/include/raft/sparse/distance/operators.cuh +++ b/cpp/include/raft/sparse/distance/operators.cuh @@ -24,24 +24,21 @@ namespace distance { struct Sum { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) - { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { return a + b; } }; struct NotEqual { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) - { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { return a != b; } }; struct SqDiff { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) - { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { return (a - b) * (a - b); } }; @@ -52,48 +49,44 @@ struct PDiff { PDiff(float p_) : p(p_) {} template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) - { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { return pow(a - b, p); } }; struct Max { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) - { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { return fmax(a, b); } }; struct AtomicAdd { template - __host__ __device__ __forceinline__ value_t operator()(value_t* a, value_t b) - { + __host__ __device__ __forceinline__ value_t operator()(value_t *a, + value_t b) { return atomicAdd(a, b); } }; struct AtomicMax { template - __host__ __device__ __forceinline__ value_t operator()(value_t* a, value_t b) - { + __host__ __device__ __forceinline__ value_t operator()(value_t *a, + value_t b) { return atomicMax(a, b); } }; struct Product { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) - { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { return a * b; } }; struct AbsDiff { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) - { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { return fabs(a - b); } }; diff --git a/cpp/include/raft/sparse/distance/utils.cuh b/cpp/include/raft/sparse/distance/utils.cuh index d78b927e46..6b6d77a2d5 100644 --- a/cpp/include/raft/sparse/distance/utils.cuh +++ b/cpp/include/raft/sparse/distance/utils.cuh @@ -34,10 +34,10 @@ namespace distance { * @return the maximum number of columns that can be stored in smem */ template -inline int max_cols_per_block() -{ +inline int max_cols_per_block() { // max cols = (total smem available - cub reduction smem) - return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) / + return (raft::getSharedMemPerBlock() - + ((tpb / raft::warp_size()) * sizeof(value_t))) / sizeof(value_t); } diff --git a/cpp/include/raft/sparse/hierarchy/common.h b/cpp/include/raft/sparse/hierarchy/common.h index 1738dd7498..29f541498b 100644 --- a/cpp/include/raft/sparse/hierarchy/common.h +++ b/cpp/include/raft/sparse/hierarchy/common.h @@ -37,15 +37,13 @@ class linkage_output { value_idx n_leaves; value_idx n_connected_components; - value_idx* labels; // size: m + value_idx *labels; // size: m - value_idx* children; // size: (m-1, 2) + value_idx *children; // size: (m-1, 2) }; -class linkage_output_int_float : public linkage_output { -}; -class linkage_output__int64_float : public linkage_output { -}; +class linkage_output_int_float : public linkage_output {}; +class linkage_output__int64_float : public linkage_output {}; }; // namespace hierarchy }; // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 95df7f4642..1ac075489a 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -42,32 +42,31 @@ class UnionFind { value_idx n_indices; UnionFind(value_idx N_) - : n_indices(2 * N_ - 1), parent(2 * N_ - 1, -1), size(2 * N_ - 1, 1), next_label(N_) - { + : n_indices(2 * N_ - 1), + parent(2 * N_ - 1, -1), + size(2 * N_ - 1, 1), + next_label(N_) { memset(size.data() + N_, 0, (size.size() - N_) * sizeof(value_idx)); } - value_idx find(value_idx n) - { + value_idx find(value_idx n) { value_idx p; p = n; - while (parent[n] != -1) - n = parent[n]; + while (parent[n] != -1) n = parent[n]; // path compression while (parent[p] != n) { - p = parent[p == -1 ? n_indices - 1 : p]; + p = parent[p == -1 ? n_indices - 1 : p]; parent[p == -1 ? n_indices - 1 : p] = n; } return n; } - void perform_union(value_idx m, value_idx n) - { + void perform_union(value_idx m, value_idx n) { size[next_label] = size[m] + size[n]; - parent[m] = next_label; - parent[n] = next_label; + parent[m] = next_label; + parent[n] = next_label; next_label += 1; } @@ -96,17 +95,12 @@ class UnionFind { * @param[out] out_size cluster sizes of output */ template -void build_dendrogram_host(const handle_t& handle, - const value_idx* rows, - const value_idx* cols, - const value_t* data, - size_t nnz, - value_idx* children, - value_t* out_delta, - value_idx* out_size) -{ +void build_dendrogram_host(const handle_t &handle, const value_idx *rows, + const value_idx *cols, const value_t *data, + size_t nnz, value_idx *children, value_t *out_delta, + value_idx *out_size) { auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); value_idx n_edges = nnz; @@ -127,8 +121,8 @@ void build_dendrogram_host(const handle_t& handle, UnionFind U(nnz + 1); for (value_idx i = 0; i < nnz; i++) { - value_idx a = mst_src_h[i]; - value_idx b = mst_dst_h[i]; + value_idx a = mst_src_h[i]; + value_idx b = mst_dst_h[i]; value_t delta = mst_weights_h[i]; value_idx aa = U.find(a); @@ -136,10 +130,10 @@ void build_dendrogram_host(const handle_t& handle, value_idx children_idx = i * 2; - children_h[children_idx] = aa; + children_h[children_idx] = aa; children_h[children_idx + 1] = bb; - out_delta_h[i] = delta; - out_size_h[i] = U.size[aa] + U.size[bb]; + out_delta_h[i] = delta; + out_size_h[i] = U.size[aa] + U.size[bb]; U.perform_union(aa, bb); } @@ -150,15 +144,13 @@ void build_dendrogram_host(const handle_t& handle, } template -__global__ void write_levels_kernel(const value_idx* children, - value_idx* parents, - value_idx n_vertices) -{ +__global__ void write_levels_kernel(const value_idx *children, + value_idx *parents, value_idx n_vertices) { value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < n_vertices) { value_idx level = tid / 2; value_idx child = children[tid]; - parents[child] = level; + parents[child] = level; } } @@ -174,17 +166,14 @@ __global__ void write_levels_kernel(const value_idx* children, * @param labels */ template -__global__ void inherit_labels(const value_idx* children, - const value_idx* levels, - size_t n_leaves, - value_idx* labels, - int cut_level, - value_idx n_vertices) -{ +__global__ void inherit_labels(const value_idx *children, + const value_idx *levels, size_t n_leaves, + value_idx *labels, int cut_level, + value_idx n_vertices) { value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < n_vertices) { - value_idx node = children[tid]; + value_idx node = children[tid]; value_idx cur_level = tid / 2; /** @@ -194,12 +183,12 @@ __global__ void inherit_labels(const value_idx* children, if (cur_level > cut_level) return; value_idx cur_parent = node; - value_idx label = labels[cur_parent]; + value_idx label = labels[cur_parent]; while (label == -1) { cur_parent = cur_level + n_leaves; - cur_level = levels[cur_parent]; - label = labels[cur_parent]; + cur_level = levels[cur_parent]; + label = labels[cur_parent]; } labels[node] = label; @@ -208,16 +197,15 @@ __global__ void inherit_labels(const value_idx* children, template struct init_label_roots { - init_label_roots(value_idx* labels_) : labels(labels_) {} + init_label_roots(value_idx *labels_) : labels(labels_) {} template - __host__ __device__ void operator()(Tuple t) - { + __host__ __device__ void operator()(Tuple t) { labels[thrust::get<1>(t)] = thrust::get<0>(t); } private: - value_idx* labels; + value_idx *labels; }; /** @@ -233,14 +221,11 @@ struct init_label_roots { * @param n_leaves */ template -void extract_flattened_clusters(const raft::handle_t& handle, - value_idx* labels, - const value_idx* children, - size_t n_clusters, - size_t n_leaves) -{ - auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); +void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, + const value_idx *children, size_t n_clusters, + size_t n_leaves) { + auto d_alloc = handle.get_device_allocator(); + auto stream = handle.get_stream(); auto thrust_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); // Handle special case where n_clusters == 1 @@ -258,8 +243,10 @@ void extract_flattened_clusters(const raft::handle_t& handle, size_t n_edges = (n_leaves - 1) * 2; - thrust::device_ptr d_ptr = thrust::device_pointer_cast(children); - value_idx n_vertices = *(thrust::max_element(thrust_policy, d_ptr, d_ptr + n_edges)) + 1; + thrust::device_ptr d_ptr = + thrust::device_pointer_cast(children); + value_idx n_vertices = + *(thrust::max_element(thrust_policy, d_ptr, d_ptr + n_edges)) + 1; // Prevent potential infinite loop from labeling disconnected // connectivities graph. @@ -270,7 +257,8 @@ void extract_flattened_clusters(const raft::handle_t& handle, rmm::device_uvector levels(n_vertices, stream); value_idx n_blocks = ceildiv(n_vertices, (value_idx)tpb); - write_levels_kernel<<>>(children, levels.data(), n_vertices); + write_levels_kernel<<>>(children, levels.data(), + n_vertices); /** * Step 1: Find label roots: * @@ -284,26 +272,27 @@ void extract_flattened_clusters(const raft::handle_t& handle, rmm::device_uvector label_roots(child_size, stream); value_idx children_cpy_start = n_edges - child_size; - raft::copy_async(label_roots.data(), children + children_cpy_start, child_size, stream); + raft::copy_async(label_roots.data(), children + children_cpy_start, + child_size, stream); - thrust::sort(thrust_policy, - label_roots.data(), + thrust::sort(thrust_policy, label_roots.data(), label_roots.data() + (child_size), thrust::greater()); rmm::device_uvector tmp_labels(n_vertices, stream); // Init labels to -1 - thrust::fill(thrust_policy, tmp_labels.data(), tmp_labels.data() + n_vertices, -1); + thrust::fill(thrust_policy, tmp_labels.data(), + tmp_labels.data() + n_vertices, -1); // Write labels for cluster roots to "labels" thrust::counting_iterator first(0); - auto z_iter = thrust::make_zip_iterator( - thrust::make_tuple(first, label_roots.data() + (label_roots.size() - n_clusters))); + auto z_iter = thrust::make_zip_iterator(thrust::make_tuple( + first, label_roots.data() + (label_roots.size() - n_clusters))); - thrust::for_each( - thrust_policy, z_iter, z_iter + n_clusters, init_label_roots(tmp_labels.data())); + thrust::for_each(thrust_policy, z_iter, z_iter + n_clusters, + init_label_roots(tmp_labels.data())); /** * Step 2: Propagate labels by having children iterate through their parents @@ -313,8 +302,9 @@ void extract_flattened_clusters(const raft::handle_t& handle, */ value_idx cut_level = (n_edges / 2) - (n_clusters - 1); - inherit_labels<<>>( - children, levels.data(), n_leaves, tmp_labels.data(), cut_level, n_vertices); + inherit_labels<<>>(children, levels.data(), + n_leaves, tmp_labels.data(), + cut_level, n_vertices); // copy tmp labels to actual labels raft::copy_async(labels, tmp_labels.data(), n_leaves, stream); diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh index 096f1c650f..7cf959dda6 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh @@ -37,17 +37,14 @@ namespace raft { namespace hierarchy { namespace detail { -template +template struct distance_graph_impl { - void run(const raft::handle_t& handle, - const value_t* X, - size_t m, - size_t n, + void run(const raft::handle_t &handle, const value_t *X, size_t m, size_t n, raft::distance::DistanceType metric, - rmm::device_uvector& indptr, - rmm::device_uvector& indices, - rmm::device_uvector& data, - int c); + rmm::device_uvector &indptr, + rmm::device_uvector &indices, + rmm::device_uvector &data, int c); }; /** @@ -56,51 +53,50 @@ struct distance_graph_impl { * @tparam value_t */ template -struct distance_graph_impl { - void run(const raft::handle_t& handle, - const value_t* X, - size_t m, - size_t n, +struct distance_graph_impl { + void run(const raft::handle_t &handle, const value_t *X, size_t m, size_t n, raft::distance::DistanceType metric, - rmm::device_uvector& indptr, - rmm::device_uvector& indices, - rmm::device_uvector& data, - int c) - { - auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + rmm::device_uvector &indptr, + rmm::device_uvector &indices, + rmm::device_uvector &data, int c) { + auto d_alloc = handle.get_device_allocator(); + auto stream = handle.get_stream(); auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); // Need to symmetrize knn into undirected graph raft::sparse::COO knn_graph_coo(d_alloc, stream); - raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, c); + raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, + c); indices.resize(knn_graph_coo.nnz, stream); data.resize(knn_graph_coo.nnz, stream); // self-loops get max distance - auto transform_in = thrust::make_zip_iterator( - thrust::make_tuple(knn_graph_coo.rows(), knn_graph_coo.cols(), knn_graph_coo.vals())); - - thrust::transform(exec_policy, - transform_in, - transform_in + knn_graph_coo.nnz, - knn_graph_coo.vals(), - [=] __device__(const thrust::tuple& tup) { - bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup); - return (self_loop * std::numeric_limits::max()) + - (!self_loop * thrust::get<2>(tup)); - }); - - raft::sparse::convert::sorted_coo_to_csr( - knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, d_alloc, stream); + auto transform_in = thrust::make_zip_iterator(thrust::make_tuple( + knn_graph_coo.rows(), knn_graph_coo.cols(), knn_graph_coo.vals())); + + thrust::transform( + exec_policy, transform_in, transform_in + knn_graph_coo.nnz, + knn_graph_coo.vals(), + [=] __device__(const thrust::tuple &tup) { + bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup); + return (self_loop * std::numeric_limits::max()) + + (!self_loop * thrust::get<2>(tup)); + }); + + raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), + knn_graph_coo.nnz, indptr.data(), + m + 1, d_alloc, stream); // TODO: Wouldn't need to copy here if we could compute knn // graph directly on the device uvectors // ref: https://github.com/rapidsai/raft/issues/227 - raft::copy_async(indices.data(), knn_graph_coo.cols(), knn_graph_coo.nnz, stream); - raft::copy_async(data.data(), knn_graph_coo.vals(), knn_graph_coo.nnz, stream); + raft::copy_async(indices.data(), knn_graph_coo.cols(), knn_graph_coo.nnz, + stream); + raft::copy_async(data.data(), knn_graph_coo.vals(), knn_graph_coo.nnz, + stream); } }; @@ -120,17 +116,13 @@ struct distance_graph_impl -void get_distance_graph(const raft::handle_t& handle, - const value_t* X, - size_t m, - size_t n, - raft::distance::DistanceType metric, - rmm::device_uvector& indptr, - rmm::device_uvector& indices, - rmm::device_uvector& data, - int c) -{ +template +void get_distance_graph(const raft::handle_t &handle, const value_t *X, + size_t m, size_t n, raft::distance::DistanceType metric, + rmm::device_uvector &indptr, + rmm::device_uvector &indices, + rmm::device_uvector &data, int c) { auto stream = handle.get_stream(); indptr.resize(m + 1, stream); diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh index f939e87484..765a5ad77f 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh @@ -37,10 +37,9 @@ namespace hierarchy { namespace detail { template -void merge_msts(raft::Graph_COO& coo1, - raft::Graph_COO& coo2, - cudaStream_t stream) -{ +void merge_msts(raft::Graph_COO &coo1, + raft::Graph_COO &coo2, + cudaStream_t stream) { /** Add edges to existing mst **/ int final_nnz = coo2.n_edges + coo1.n_edges; @@ -51,9 +50,12 @@ void merge_msts(raft::Graph_COO& coo1, /** * Construct final edge list */ - raft::copy_async(coo1.src.data() + coo1.n_edges, coo2.src.data(), coo2.n_edges, stream); - raft::copy_async(coo1.dst.data() + coo1.n_edges, coo2.dst.data(), coo2.n_edges, stream); - raft::copy_async(coo1.weights.data() + coo1.n_edges, coo2.weights.data(), coo2.n_edges, stream); + raft::copy_async(coo1.src.data() + coo1.n_edges, coo2.src.data(), + coo2.n_edges, stream); + raft::copy_async(coo1.dst.data() + coo1.n_edges, coo2.dst.data(), + coo2.n_edges, stream); + raft::copy_async(coo1.weights.data() + coo1.n_edges, coo2.weights.data(), + coo2.n_edges, stream); coo1.n_edges = final_nnz; } @@ -72,18 +74,14 @@ void merge_msts(raft::Graph_COO& coo1, * @return updated MST edge list */ template -void connect_knn_graph( - const raft::handle_t& handle, - const value_t* X, - raft::Graph_COO& msf, - size_t m, - size_t n, - value_idx* color, - red_op reduction_op, - raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) -{ +void connect_knn_graph(const raft::handle_t &handle, const value_t *X, + raft::Graph_COO &msf, + size_t m, size_t n, value_idx *color, + red_op reduction_op, + raft::distance::DistanceType metric = + raft::distance::DistanceType::L2SqrtExpanded) { auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); raft::sparse::COO connected_edges(d_alloc, stream); @@ -91,21 +89,15 @@ void connect_knn_graph( handle, connected_edges, X, color, m, n, reduction_op); rmm::device_uvector indptr2(m + 1, stream); - raft::sparse::convert::sorted_coo_to_csr( - connected_edges.rows(), connected_edges.nnz, indptr2.data(), m + 1, d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr(connected_edges.rows(), + connected_edges.nnz, indptr2.data(), + m + 1, d_alloc, stream); // On the second call, we hand the MST the original colors // and the new set of edges and let it restart the optimization process - auto new_mst = raft::mst::mst(handle, - indptr2.data(), - connected_edges.cols(), - connected_edges.vals(), - m, - connected_edges.nnz, - color, - stream, - false, - false); + auto new_mst = raft::mst::mst( + handle, indptr2.data(), connected_edges.cols(), connected_edges.vals(), m, + connected_edges.nnz, color, stream, false, false); merge_msts(msf, new_mst, stream); } @@ -135,35 +127,29 @@ void connect_knn_graph( * argument is really just a safeguard against the potential for infinite loops. */ template -void build_sorted_mst( - const raft::handle_t& handle, - const value_t* X, - const value_idx* indptr, - const value_idx* indices, - const value_t* pw_dists, - size_t m, - size_t n, - value_idx* mst_src, - value_idx* mst_dst, - value_t* mst_weight, - value_idx* color, - size_t nnz, - red_op reduction_op, - raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded, - int max_iter = 10) -{ +void build_sorted_mst(const raft::handle_t &handle, const value_t *X, + const value_idx *indptr, const value_idx *indices, + const value_t *pw_dists, size_t m, size_t n, + value_idx *mst_src, value_idx *mst_dst, + value_t *mst_weight, value_idx *color, size_t nnz, + red_op reduction_op, + raft::distance::DistanceType metric = + raft::distance::DistanceType::L2SqrtExpanded, + int max_iter = 10) { auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // We want to have MST initialize colors on first call. auto mst_coo = raft::mst::mst( - handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false, true); + handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false, + true); - int iters = 1; + int iters = 1; int n_components = linkage::get_n_components(color, m, d_alloc, stream); while (n_components > 1 && iters < max_iter) { - connect_knn_graph(handle, X, mst_coo, m, n, color, reduction_op); + connect_knn_graph(handle, X, mst_coo, m, n, color, + reduction_op); iters++; @@ -190,8 +176,9 @@ void build_sorted_mst( " or increase 'max_iter'", max_iter); - raft::sparse::op::coo_sort_by_weight( - mst_coo.src.data(), mst_coo.dst.data(), mst_coo.weights.data(), mst_coo.n_edges, stream); + raft::sparse::op::coo_sort_by_weight(mst_coo.src.data(), mst_coo.dst.data(), + mst_coo.weights.data(), mst_coo.n_edges, + stream); raft::copy_async(mst_src, mst_coo.src.data(), mst_coo.n_edges, stream); raft::copy_async(mst_dst, mst_coo.dst.data(), mst_coo.n_edges, stream); diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp index fe9538120f..01a033945c 100644 --- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp +++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp @@ -44,26 +44,20 @@ static const size_t EMPTY = 0; * @param[in] n number of columns in X * @param[in] metric distance metrix to use when constructing connectivities graph * @param[out] out struct containing output dendrogram and cluster assignments - * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect - control + * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect control * of k. The algorithm will set `k = log(n) + c` * @param[in] n_clusters number of clusters to assign data samples */ -template -void single_linkage(const raft::handle_t& handle, - const value_t* X, - size_t m, - size_t n, - raft::distance::DistanceType metric, - linkage_output* out, - int c, - size_t n_clusters) -{ - ASSERT(n_clusters <= m, "n_clusters must be less than or equal to the number of data points"); - - auto stream = handle.get_stream(); +void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m, + size_t n, raft::distance::DistanceType metric, + linkage_output *out, int c, + size_t n_clusters) { + ASSERT(n_clusters <= m, + "n_clusters must be less than or equal to the number of data points"); + + auto stream = handle.get_stream(); auto d_alloc = handle.get_device_allocator(); rmm::device_uvector indptr(EMPTY, stream); @@ -85,20 +79,10 @@ void single_linkage(const raft::handle_t& handle, */ rmm::device_uvector color(m, stream); raft::linkage::FixConnectivitiesRedOp op(color.data(), m); - detail::build_sorted_mst(handle, - X, - indptr.data(), - indices.data(), - pw_dists.data(), - m, - n, - mst_rows.data(), - mst_cols.data(), - mst_data.data(), - color.data(), - indices.size(), - op, - metric); + detail::build_sorted_mst( + handle, X, indptr.data(), indices.data(), pw_dists.data(), m, n, + mst_rows.data(), mst_cols.data(), mst_data.data(), color.data(), + indices.size(), op, metric); pw_dists.release(); @@ -110,19 +94,15 @@ void single_linkage(const raft::handle_t& handle, rmm::device_uvector out_delta(n_edges, stream); rmm::device_uvector out_size(n_edges, stream); // Create dendrogram - detail::build_dendrogram_host(handle, - mst_rows.data(), - mst_cols.data(), - mst_data.data(), - n_edges, - out->children, - out_delta.data(), - out_size.data()); - detail::extract_flattened_clusters(handle, out->labels, out->children, n_clusters, m); - - out->m = m; - out->n_clusters = n_clusters; - out->n_leaves = m; + detail::build_dendrogram_host( + handle, mst_rows.data(), mst_cols.data(), mst_data.data(), n_edges, + out->children, out_delta.data(), out_size.data()); + detail::extract_flattened_clusters(handle, out->labels, out->children, + n_clusters, m); + + out->m = m; + out->n_clusters = n_clusters; + out->n_leaves = m; out->n_connected_components = 1; } diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh index 01735a102d..47b1ba6e41 100644 --- a/cpp/include/raft/sparse/linalg/add.cuh +++ b/cpp/include/raft/sparse/linalg/add.cuh @@ -40,47 +40,40 @@ namespace sparse { namespace linalg { template -__global__ void csr_add_calc_row_counts_kernel(const int* a_ind, - const int* a_indptr, - const T* a_val, - int nnz1, - const int* b_ind, - const int* b_indptr, - const T* b_val, - int nnz2, - int m, - int* out_rowcounts) -{ +__global__ void csr_add_calc_row_counts_kernel( + const int *a_ind, const int *a_indptr, const T *a_val, int nnz1, + const int *b_ind, const int *b_indptr, const T *b_val, int nnz2, int m, + int *out_rowcounts) { // loop through columns in each set of rows and // calculate number of unique cols across both rows int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < m) { int a_start_idx = a_ind[row]; - int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind); + int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind); int b_start_idx = b_ind[row]; - int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); + int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); /** - * Union of columns within each row of A and B so that we can scan through - * them, adding their values together. - */ + * Union of columns within each row of A and B so that we can scan through + * them, adding their values together. + */ int max_size = (a_stop_idx - a_start_idx) + (b_stop_idx - b_start_idx); - int* arr = new int[max_size]; + int *arr = new int[max_size]; int cur_arr_idx = 0; for (int j = a_start_idx; j < a_stop_idx; j++) { arr[cur_arr_idx] = a_indptr[j]; cur_arr_idx++; } - int arr_size = cur_arr_idx; + int arr_size = cur_arr_idx; int final_size = arr_size; for (int j = b_start_idx; j < b_stop_idx; j++) { int cur_col = b_indptr[j]; - bool found = false; + bool found = false; for (int k = 0; k < arr_size; k++) { if (arr[k] == cur_col) { found = true; @@ -88,7 +81,9 @@ __global__ void csr_add_calc_row_counts_kernel(const int* a_ind, } } - if (!found) { final_size++; } + if (!found) { + final_size++; + } } out_rowcounts[row] = final_size; @@ -99,19 +94,11 @@ __global__ void csr_add_calc_row_counts_kernel(const int* a_ind, } template -__global__ void csr_add_kernel(const int* a_ind, - const int* a_indptr, - const T* a_val, - int nnz1, - const int* b_ind, - const int* b_indptr, - const T* b_val, - int nnz2, - int m, - int* out_ind, - int* out_indptr, - T* out_val) -{ +__global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, + const T *a_val, int nnz1, const int *b_ind, + const int *b_indptr, const T *b_val, int nnz2, + int m, int *out_ind, int *out_indptr, + T *out_val) { // 1 thread per row int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -122,21 +109,21 @@ __global__ void csr_add_kernel(const int* a_ind, int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind); int b_start_idx = b_ind[row]; - int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); + int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); int o_idx = out_ind[row]; int cur_o_idx = o_idx; for (int j = a_start_idx; j < a_stop_idx; j++) { out_indptr[cur_o_idx] = a_indptr[j]; - out_val[cur_o_idx] = a_val[j]; + out_val[cur_o_idx] = a_val[j]; cur_o_idx++; } int arr_size = cur_o_idx - o_idx; for (int j = b_start_idx; j < b_stop_idx; j++) { int cur_col = b_indptr[j]; - bool found = false; + bool found = false; for (int k = o_idx; k < o_idx + arr_size; k++) { // If we found a match, sum the two values if (out_indptr[k] == cur_col) { @@ -149,7 +136,7 @@ __global__ void csr_add_kernel(const int* a_ind, // if we didn't find a match, add the value for b if (!found) { out_indptr[o_idx + arr_size] = cur_col; - out_val[o_idx + arr_size] = b_val[j]; + out_val[o_idx + arr_size] = b_val[j]; arr_size++; } } @@ -173,36 +160,32 @@ __global__ void csr_add_kernel(const int* a_ind, * @param stream: cuda stream to use */ template -size_t csr_add_calc_inds(const int* a_ind, - const int* a_indptr, - const T* a_val, - int nnz1, - const int* b_ind, - const int* b_indptr, - const T* b_val, - int nnz2, - int m, - int* out_ind, +size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, + int nnz1, const int *b_ind, const int *b_indptr, + const T *b_val, int nnz2, int m, int *out_ind, std::shared_ptr d_alloc, - cudaStream_t stream) -{ + cudaStream_t stream) { dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); raft::mr::device::buffer row_counts(d_alloc, stream, m + 1); - CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream)); + CUDA_CHECK( + cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream)); - csr_add_calc_row_counts_kernel<<>>( - a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, row_counts.data()); + csr_add_calc_row_counts_kernel + <<>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, + b_val, nnz2, m, row_counts.data()); int cnnz = 0; raft::update_host(&cnnz, row_counts.data() + m, 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); // create csr compressed row index from row counts - thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); - thrust::device_ptr c_ind_d = thrust::device_pointer_cast(out_ind); - exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, c_ind_d); + thrust::device_ptr row_counts_d = + thrust::device_pointer_cast(row_counts.data()); + thrust::device_ptr c_ind_d = thrust::device_pointer_cast(out_ind); + exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, + c_ind_d); return cnnz; } @@ -225,25 +208,16 @@ size_t csr_add_calc_inds(const int* a_ind, * @param stream: cuda stream to use */ template -void csr_add_finalize(const int* a_ind, - const int* a_indptr, - const T* a_val, - int nnz1, - const int* b_ind, - const int* b_indptr, - const T* b_val, - int nnz2, - int m, - int* c_ind, - int* c_indptr, - T* c_val, - cudaStream_t stream) -{ +void csr_add_finalize(const int *a_ind, const int *a_indptr, const T *a_val, + int nnz1, const int *b_ind, const int *b_indptr, + const T *b_val, int nnz2, int m, int *c_ind, + int *c_indptr, T *c_val, cudaStream_t stream) { dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_add_kernel<<>>( - a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, c_ind, c_indptr, c_val); + csr_add_kernel + <<>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, + b_val, nnz2, m, c_ind, c_indptr, c_val); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh index 77a9445ab1..9bd322c90a 100644 --- a/cpp/include/raft/sparse/linalg/degree.cuh +++ b/cpp/include/raft/sparse/linalg/degree.cuh @@ -44,10 +44,11 @@ namespace linalg { * @param results array to place results */ template -__global__ void coo_degree_kernel(const int* rows, int nnz, int* results) -{ +__global__ void coo_degree_kernel(const int *rows, int nnz, int *results) { int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < nnz) { raft::myAtomicAdd(results + rows[row], 1); } + if (row < nnz) { + raft::myAtomicAdd(results + rows[row], 1); + } } /** @@ -59,8 +60,7 @@ __global__ void coo_degree_kernel(const int* rows, int nnz, int* results) * @param stream: cuda stream to use */ template -void coo_degree(const int* rows, int nnz, int* results, cudaStream_t stream) -{ +void coo_degree(const int *rows, int nnz, int *results, cudaStream_t stream) { dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); @@ -77,28 +77,31 @@ void coo_degree(const int* rows, int nnz, int* results, cudaStream_t stream) * @param stream: cuda stream to use */ template -void coo_degree(COO* in, int* results, cudaStream_t stream) -{ +void coo_degree(COO *in, int *results, cudaStream_t stream) { dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); - coo_degree_kernel<<>>(in->rows(), in->nnz, results); + coo_degree_kernel + <<>>(in->rows(), in->nnz, results); CUDA_CHECK(cudaGetLastError()); } template -__global__ void coo_degree_nz_kernel(const int* rows, const T* vals, int nnz, int* results) -{ +__global__ void coo_degree_nz_kernel(const int *rows, const T *vals, int nnz, + int *results) { int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < nnz && vals[row] != 0.0) { raft::myAtomicAdd(results + rows[row], 1); } + if (row < nnz && vals[row] != 0.0) { + raft::myAtomicAdd(results + rows[row], 1); + } } template -__global__ void coo_degree_scalar_kernel( - const int* rows, const T* vals, int nnz, T scalar, int* results) -{ +__global__ void coo_degree_scalar_kernel(const int *rows, const T *vals, + int nnz, T scalar, int *results) { int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < nnz && vals[row] != scalar) { raft::myAtomicAdd(results + rows[row], 1); } + if (row < nnz && vals[row] != scalar) { + raft::myAtomicAdd(results + rows[row], 1); + } } /** @@ -111,12 +114,12 @@ __global__ void coo_degree_scalar_kernel( * @param stream: cuda stream to use */ template -void coo_degree_scalar(COO* in, T scalar, int* results, cudaStream_t stream) -{ +void coo_degree_scalar(COO *in, T scalar, int *results, + cudaStream_t stream) { dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); - coo_degree_scalar_kernel - <<>>(in->rows(), in->vals(), in->nnz, scalar, results); + coo_degree_scalar_kernel<<>>( + in->rows(), in->vals(), in->nnz, scalar, results); CUDA_CHECK(cudaGetLastError()); } @@ -132,9 +135,8 @@ void coo_degree_scalar(COO* in, T scalar, int* results, cudaStream_t stream) * @param stream: cuda stream to use */ template -void coo_degree_scalar( - const int* rows, const T* vals, int nnz, T scalar, int* results, cudaStream_t stream = 0) -{ +void coo_degree_scalar(const int *rows, const T *vals, int nnz, T scalar, + int *results, cudaStream_t stream = 0) { dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); coo_degree_scalar_kernel @@ -152,11 +154,12 @@ void coo_degree_scalar( * @param stream: cuda stream to use */ template -void coo_degree_nz(const int* rows, const T* vals, int nnz, int* results, cudaStream_t stream) -{ +void coo_degree_nz(const int *rows, const T *vals, int nnz, int *results, + cudaStream_t stream) { dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); - coo_degree_nz_kernel<<>>(rows, vals, nnz, results); + coo_degree_nz_kernel + <<>>(rows, vals, nnz, results); } /** @@ -168,8 +171,7 @@ void coo_degree_nz(const int* rows, const T* vals, int nnz, int* results, cudaSt * @param stream: cuda stream to use */ template -void coo_degree_nz(COO* in, int* results, cudaStream_t stream) -{ +void coo_degree_nz(COO *in, int *results, cudaStream_t stream) { dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh index 59dc5ff3e4..bfcd3fd592 100644 --- a/cpp/include/raft/sparse/linalg/norm.cuh +++ b/cpp/include/raft/sparse/linalg/norm.cuh @@ -41,12 +41,10 @@ __global__ void csr_row_normalize_l1_kernel( // @TODO: This can be done much more parallel by // having threads in a warp compute the sum in parallel // over each row and then divide the values in parallel. - const int* ia, // csr row ex_scan (sorted by row) - const T* vals, - int nnz, // array of values and number of non-zeros - int m, // num rows in csr - T* result) -{ // output array + const int *ia, // csr row ex_scan (sorted by row) + const T *vals, int nnz, // array of values and number of non-zeros + int m, // num rows in csr + T *result) { // output array // row-based matrix 1 thread per row int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -54,7 +52,7 @@ __global__ void csr_row_normalize_l1_kernel( // sum all vals_arr for row and divide each val by sum if (row < m) { int start_idx = ia[row]; - int stop_idx = 0; + int stop_idx = 0; if (row < m - 1) { stop_idx = ia[row + 1]; } else @@ -67,7 +65,7 @@ __global__ void csr_row_normalize_l1_kernel( for (int j = start_idx; j < stop_idx; j++) { if (sum != 0.0) { - T val = vals[j]; + T val = vals[j]; result[j] = val / sum; } else { result[j] = 0.0; @@ -87,18 +85,18 @@ __global__ void csr_row_normalize_l1_kernel( * @param stream: cuda stream to use */ template -void csr_row_normalize_l1(const int* ia, // csr row ex_scan (sorted by row) - const T* vals, +void csr_row_normalize_l1(const int *ia, // csr row ex_scan (sorted by row) + const T *vals, int nnz, // array of values and number of non-zeros int m, // num rows in csr - T* result, - cudaStream_t stream) -{ // output array + T *result, + cudaStream_t stream) { // output array dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_row_normalize_l1_kernel<<>>(ia, vals, nnz, m, result); + csr_row_normalize_l1_kernel + <<>>(ia, vals, nnz, m, result); CUDA_CHECK(cudaGetLastError()); } @@ -107,12 +105,10 @@ __global__ void csr_row_normalize_max_kernel( // @TODO: This can be done much more parallel by // having threads in a warp compute the sum in parallel // over each row and then divide the values in parallel. - const int* ia, // csr row ind array (sorted by row) - const T* vals, - int nnz, // array of values and number of non-zeros - int m, // num total rows in csr - T* result) -{ // output array + const int *ia, // csr row ind array (sorted by row) + const T *vals, int nnz, // array of values and number of non-zeros + int m, // num total rows in csr + T *result) { // output array // row-based matrix 1 thread per row int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -120,7 +116,7 @@ __global__ void csr_row_normalize_max_kernel( // find max across columns and divide if (row < m) { int start_idx = ia[row]; - int stop_idx = 0; + int stop_idx = 0; if (row < m - 1) { stop_idx = ia[row + 1]; } else @@ -134,7 +130,7 @@ __global__ void csr_row_normalize_max_kernel( // divide nonzeros in current row by max for (int j = start_idx; j < stop_idx; j++) { if (max != 0.0 && max > std::numeric_limits::min()) { - T val = vals[j]; + T val = vals[j]; result[j] = val / max; } else { result[j] = 0.0; @@ -155,17 +151,16 @@ __global__ void csr_row_normalize_max_kernel( */ template -void csr_row_normalize_max(const int* ia, // csr row ind array (sorted by row) - const T* vals, +void csr_row_normalize_max(const int *ia, // csr row ind array (sorted by row) + const T *vals, int nnz, // array of values and number of non-zeros int m, // num total rows in csr - T* result, - cudaStream_t stream) -{ + T *result, cudaStream_t stream) { dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_row_normalize_max_kernel<<>>(ia, vals, nnz, m, result); + csr_row_normalize_max_kernel + <<>>(ia, vals, nnz, m, result); CUDA_CHECK(cudaGetLastError()); } diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh index 3b609d994f..15302f3b74 100644 --- a/cpp/include/raft/sparse/linalg/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/spectral.cuh @@ -31,23 +31,16 @@ namespace sparse { namespace spectral { template -void fit_embedding(const raft::handle_t& handle, - int* rows, - int* cols, - T* vals, - int nnz, - int n, - int n_components, - T* out, - unsigned long long seed = 1234567) -{ - auto stream = handle.get_stream(); +void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, + int nnz, int n, int n_components, T *out, + unsigned long long seed = 1234567) { + auto stream = handle.get_stream(); auto d_alloc = handle.get_device_allocator(); raft::mr::device::buffer src_offsets(d_alloc, stream, n + 1); raft::mr::device::buffer dst_cols(d_alloc, stream, nnz); raft::mr::device::buffer dst_vals(d_alloc, stream, nnz); - convert::coo_to_csr( - handle, rows, cols, vals, nnz, n, src_offsets.data(), dst_cols.data(), dst_vals.data()); + convert::coo_to_csr(handle, rows, cols, vals, nnz, n, src_offsets.data(), + dst_cols.data(), dst_vals.data()); raft::mr::device::buffer eigVals(d_alloc, stream, n_components + 1); raft::mr::device::buffer eigVecs(d_alloc, stream, n * (n_components + 1)); @@ -61,53 +54,48 @@ void fit_embedding(const raft::handle_t& handle, using index_type = int; using value_type = T; - index_type* ro = src_offsets.data(); - index_type* ci = dst_cols.data(); - value_type* vs = dst_vals.data(); + index_type *ro = src_offsets.data(); + index_type *ci = dst_cols.data(); + value_type *vs = dst_vals.data(); - raft::matrix::sparse_matrix_t const r_csr_m{handle, ro, ci, vs, n, nnz}; + raft::matrix::sparse_matrix_t const r_csr_m{ + handle, ro, ci, vs, n, nnz}; - index_type neigvs = n_components + 1; - index_type maxiter = 4000; // default reset value (when set to 0); - value_type tol = 0.01; - index_type restart_iter = 15 + neigvs; // what cugraph is using - auto t_exe_p = thrust::cuda::par.on(stream); + index_type neigvs = n_components + 1; + index_type maxiter = 4000; //default reset value (when set to 0); + value_type tol = 0.01; + index_type restart_iter = 15 + neigvs; //what cugraph is using + auto t_exe_p = thrust::cuda::par.on(stream); using thrust_exe_policy_t = decltype(t_exe_p); - raft::eigen_solver_config_t cfg{neigvs, maxiter, restart_iter, tol}; + raft::eigen_solver_config_t cfg{neigvs, maxiter, + restart_iter, tol}; cfg.seed = seed; raft::lanczos_solver_t eig_solver{cfg}; - // cluster computation here is irrelevant, - // hence define a no-op such solver to - // feed partition(): + //cluster computation here is irrelevant, + //hence define a no-op such solver to + //feed partition(): // struct no_op_cluster_solver_t { using index_type_t = index_type; - using size_type_t = index_type; + using size_type_t = index_type; using value_type_t = value_type; - std::pair solve(handle_t const& handle, - thrust_exe_policy_t t_exe_policy, - size_type_t n_obs_vecs, - size_type_t dim, - value_type_t const* __restrict__ obs, - index_type_t* __restrict__ codes) const - { + std::pair solve( + handle_t const &handle, thrust_exe_policy_t t_exe_policy, + size_type_t n_obs_vecs, size_type_t dim, + value_type_t const *__restrict__ obs, + index_type_t *__restrict__ codes) const { return std::make_pair(0, 0); } }; - raft::spectral::partition(handle, - t_exe_p, - r_csr_m, - eig_solver, - no_op_cluster_solver_t{}, - labels.data(), - eigVals.data(), - eigVecs.data()); + raft::spectral::partition(handle, t_exe_p, r_csr_m, eig_solver, + no_op_cluster_solver_t{}, labels.data(), + eigVals.data(), eigVecs.data()); raft::copy(out, eigVecs.data() + n, n * n_components, stream); diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh index b9426c284a..5c2c78f0c3 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh @@ -49,34 +49,26 @@ namespace linalg { // TODO: value_idx param needs to be used for this once FAISS is updated to use float32 // for indices so that the index types can be uniform template -__global__ void coo_symmetrize_kernel(int* row_ind, - int* rows, - int* cols, - T* vals, - int* orows, - int* ocols, - T* ovals, - int n, - int cnnz, - Lambda reduction_op) -{ +__global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, + T *vals, int *orows, int *ocols, T *ovals, + int n, int cnnz, Lambda reduction_op) { int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < n) { int start_idx = row_ind[row]; // each thread processes one row - int stop_idx = get_stop_idx(row, n, cnnz, row_ind); + int stop_idx = get_stop_idx(row, n, cnnz, row_ind); - int row_nnz = 0; + int row_nnz = 0; int out_start_idx = start_idx * 2; for (int idx = 0; idx < stop_idx - start_idx; idx++) { int cur_row = rows[idx + start_idx]; int cur_col = cols[idx + start_idx]; - T cur_val = vals[idx + start_idx]; + T cur_val = vals[idx + start_idx]; int lookup_row = cur_col; - int t_start = row_ind[lookup_row]; // Start at - int t_stop = get_stop_idx(lookup_row, n, cnnz, row_ind); + int t_start = row_ind[lookup_row]; // Start at + int t_stop = get_stop_idx(lookup_row, n, cnnz, row_ind); T transpose = 0.0; @@ -87,7 +79,7 @@ __global__ void coo_symmetrize_kernel(int* row_ind, // done in a different thread. if (cols[t_idx] == cur_row && rows[t_idx] == cur_col) { // If it exists already, set transposed value to existing value - transpose = vals[t_idx]; + transpose = vals[t_idx]; found_match = true; break; } @@ -134,12 +126,10 @@ __global__ void coo_symmetrize_kernel(int* row_ind, * @param stream: cuda stream to use */ template -void coo_symmetrize(COO* in, - COO* out, +void coo_symmetrize(COO *in, COO *out, Lambda reduction_op, // two-argument reducer std::shared_ptr d_alloc, - cudaStream_t stream) -{ + cudaStream_t stream) { dim3 grid(raft::ceildiv(in->n_rows, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); @@ -151,16 +141,9 @@ void coo_symmetrize(COO* in, out->allocate(in->nnz * 2, in->n_rows, in->n_cols, true, stream); - coo_symmetrize_kernel<<>>(in_row_ind.data(), - in->rows(), - in->cols(), - in->vals(), - out->rows(), - out->cols(), - out->vals(), - in->n_rows, - in->nnz, - reduction_op); + coo_symmetrize_kernel<<>>( + in_row_ind.data(), in->rows(), in->cols(), in->vals(), out->rows(), + out->cols(), out->vals(), in->n_rows, in->nnz, reduction_op); CUDA_CHECK(cudaPeekAtLastError()); } @@ -176,15 +159,14 @@ void coo_symmetrize(COO* in, * @param row_sizes2: Input empty row sum 2 array(n) for faster reduction */ template -__global__ static void symmetric_find_size(const value_t* restrict data, - const value_idx* restrict indices, - const value_idx n, - const int k, - value_idx* restrict row_sizes, - value_idx* restrict row_sizes2) -{ +__global__ static void symmetric_find_size(const value_t *restrict data, + const value_idx *restrict indices, + const value_idx n, const int k, + value_idx *restrict row_sizes, + value_idx *restrict row_sizes2) { const auto row = blockIdx.x * blockDim.x + threadIdx.x; // for every row - const auto j = blockIdx.y * blockDim.y + threadIdx.y; // for every item in row + const auto j = + blockIdx.y * blockDim.y + threadIdx.y; // for every item in row if (row >= n || j >= k) return; const auto col = indices[row * k + j]; @@ -204,11 +186,9 @@ __global__ static void symmetric_find_size(const value_t* restrict data, * @param row_sizes2: Input row sum 2 array(n) for faster reduction */ template -__global__ static void reduce_find_size(const value_idx n, - const int k, - value_idx* restrict row_sizes, - const value_idx* restrict row_sizes2) -{ +__global__ static void reduce_find_size(const value_idx n, const int k, + value_idx *restrict row_sizes, + const value_idx *restrict row_sizes2) { const auto i = (blockIdx.x * blockDim.x) + threadIdx.x; if (i >= n) return; row_sizes[i] += (row_sizes2[i] + k); @@ -229,21 +209,20 @@ __global__ static void reduce_find_size(const value_idx n, * @param k: Number of n_neighbors */ template -__global__ static void symmetric_sum(value_idx* restrict edges, - const value_t* restrict data, - const value_idx* restrict indices, - value_t* restrict VAL, - value_idx* restrict COL, - value_idx* restrict ROW, - const value_idx n, - const int k) -{ +__global__ static void symmetric_sum(value_idx *restrict edges, + const value_t *restrict data, + const value_idx *restrict indices, + value_t *restrict VAL, + value_idx *restrict COL, + value_idx *restrict ROW, const value_idx n, + const int k) { const auto row = blockIdx.x * blockDim.x + threadIdx.x; // for every row - const auto j = blockIdx.y * blockDim.y + threadIdx.y; // for every item in row + const auto j = + blockIdx.y * blockDim.y + threadIdx.y; // for every item in row if (row >= n || j >= k) return; - const auto col = indices[row * k + j]; - const auto original = atomicAdd(&edges[row], value_idx(1)); + const auto col = indices[row * k + j]; + const auto original = atomicAdd(&edges[row], value_idx(1)); const auto transpose = atomicAdd(&edges[col], value_idx(1)); VAL[transpose] = VAL[original] = data[row * k + j]; @@ -273,26 +252,26 @@ __global__ static void symmetric_sum(value_idx* restrict edges, * @param stream: Input cuda stream * @param d_alloc device allocator for temporary buffers */ -template -void from_knn_symmetrize_matrix(const value_idx* restrict knn_indices, - const value_t* restrict knn_dists, - const value_idx n, - const int k, - COO* out, - cudaStream_t stream, - std::shared_ptr d_alloc) -{ +template +void from_knn_symmetrize_matrix( + const value_idx *restrict knn_indices, const value_t *restrict knn_dists, + const value_idx n, const int k, COO *out, + cudaStream_t stream, std::shared_ptr d_alloc) { // (1) Find how much space needed in each row // We look through all datapoints and increment the count for each row. const dim3 threadsPerBlock(TPB_X, TPB_Y); - const dim3 numBlocks(raft::ceildiv(n, (value_idx)TPB_X), raft::ceildiv(k, TPB_Y)); + const dim3 numBlocks(raft::ceildiv(n, (value_idx)TPB_X), + raft::ceildiv(k, TPB_Y)); // Notice n+1 since we can reuse these arrays for transpose_edges, original_edges in step (4) raft::mr::device::buffer row_sizes(d_alloc, stream, n); - CUDA_CHECK(cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream)); + CUDA_CHECK( + cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream)); raft::mr::device::buffer row_sizes2(d_alloc, stream, n); - CUDA_CHECK(cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream)); + CUDA_CHECK( + cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream)); symmetric_find_size<<>>( knn_dists, knn_indices, n, k, row_sizes.data(), row_sizes2.data()); @@ -313,12 +292,14 @@ void from_knn_symmetrize_matrix(const value_idx* restrict knn_indices, // This mirrors CSR matrix's row Pointer, were maximum bounds for each row // are calculated as the cumulative rolling sum of the previous rows. // Notice reusing old row_sizes2 memory - value_idx* edges = row_sizes2.data(); - thrust::device_ptr __edges = thrust::device_pointer_cast(edges); - thrust::device_ptr __row_sizes = thrust::device_pointer_cast(row_sizes.data()); + value_idx *edges = row_sizes2.data(); + thrust::device_ptr __edges = thrust::device_pointer_cast(edges); + thrust::device_ptr __row_sizes = + thrust::device_pointer_cast(row_sizes.data()); // Rolling cumulative sum - thrust::exclusive_scan(thrust::cuda::par.on(stream), __row_sizes, __row_sizes + n, __edges); + thrust::exclusive_scan(thrust::cuda::par.on(stream), __row_sizes, + __row_sizes + n, __edges); // (5) Perform final data + data.T operation in tandem with memcpying symmetric_sum<<>>( @@ -330,17 +311,11 @@ void from_knn_symmetrize_matrix(const value_idx* restrict knn_indices, * Symmetrizes a COO matrix */ template -void symmetrize(const raft::handle_t& handle, - const value_idx* rows, - const value_idx* cols, - const value_t* vals, - size_t m, - size_t n, - size_t nnz, - raft::sparse::COO& out) -{ +void symmetrize(const raft::handle_t &handle, const value_idx *rows, + const value_idx *cols, const value_t *vals, size_t m, size_t n, + size_t nnz, raft::sparse::COO &out) { auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // copy rows to cols and cols to rows rmm::device_uvector symm_rows(nnz * 2, stream); @@ -356,17 +331,13 @@ void symmetrize(const raft::handle_t& handle, raft::copy_async(symm_vals.data() + nnz, vals, nnz, stream); // sort COO - raft::sparse::op::coo_sort((value_idx)m, - (value_idx)n, - (value_idx)nnz * 2, - symm_rows.data(), - symm_cols.data(), - symm_vals.data(), - d_alloc, - stream); - - raft::sparse::op::max_duplicates( - handle, out, symm_rows.data(), symm_cols.data(), symm_vals.data(), nnz * 2, m, n); + raft::sparse::op::coo_sort((value_idx)m, (value_idx)n, (value_idx)nnz * 2, + symm_rows.data(), symm_cols.data(), + symm_vals.data(), d_alloc, stream); + + raft::sparse::op::max_duplicates(handle, out, symm_rows.data(), + symm_cols.data(), symm_vals.data(), nnz * 2, + m, n); } }; // end NAMESPACE linalg diff --git a/cpp/include/raft/sparse/linalg/transpose.h b/cpp/include/raft/sparse/linalg/transpose.h index ce90eb6702..6afe4ca8f6 100644 --- a/cpp/include/raft/sparse/linalg/transpose.h +++ b/cpp/include/raft/sparse/linalg/transpose.h @@ -57,55 +57,29 @@ namespace linalg { * @param[in] stream : Cuda stream for ordering events */ template -void csr_transpose(cusparseHandle_t handle, - const value_idx* csr_indptr, - const value_idx* csr_indices, - const value_t* csr_data, - value_idx* csc_indptr, - value_idx* csc_indices, - value_t* csc_data, - value_idx csr_nrows, - value_idx csr_ncols, +void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr, + const value_idx *csr_indices, const value_t *csr_data, + value_idx *csc_indptr, value_idx *csc_indices, + value_t *csc_data, value_idx csr_nrows, value_idx csr_ncols, value_idx nnz, std::shared_ptr allocator, - cudaStream_t stream) -{ + cudaStream_t stream) { size_t convert_csc_workspace_size = 0; - CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize(handle, - csr_nrows, - csr_ncols, - nnz, - csr_data, - csr_indptr, - csr_indices, - csc_data, - csc_indptr, - csc_indices, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, - &convert_csc_workspace_size, - stream)); + CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize( + handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices, + csc_data, csc_indptr, csc_indices, CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, + &convert_csc_workspace_size, stream)); raft::mr::device::buffer convert_csc_workspace( allocator, stream, convert_csc_workspace_size); - CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc(handle, - csr_nrows, - csr_ncols, - nnz, - csr_data, - csr_indptr, - csr_indices, - csc_data, - csc_indptr, - csc_indices, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, - convert_csc_workspace.data(), - stream)); + CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc( + handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices, + csc_data, csc_indptr, csc_indices, CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, + convert_csc_workspace.data(), stream)); } }; // end NAMESPACE linalg diff --git a/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh b/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh index 36d426029b..f0d30b0cb7 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh +++ b/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh @@ -28,16 +28,10 @@ namespace mst { namespace detail { template -__global__ void kernel_min_edge_per_vertex(const edge_t* offsets, - const vertex_t* indices, - const alteration_t* weights, - const vertex_t* color, - const vertex_t* color_index, - edge_t* new_mst_edge, - const bool* mst_edge, - alteration_t* min_edge_color, - const vertex_t v) -{ +__global__ void kernel_min_edge_per_vertex( + const edge_t* offsets, const vertex_t* indices, const alteration_t* weights, + const vertex_t* color, const vertex_t* color_index, edge_t* new_mst_edge, + const bool* mst_edge, alteration_t* min_edge_color, const vertex_t v) { edge_t tid = threadIdx.x + blockIdx.x * blockDim.x; unsigned warp_id = tid / 32; @@ -47,14 +41,14 @@ __global__ void kernel_min_edge_per_vertex(const edge_t* offsets, __shared__ alteration_t min_edge_weight[32]; __shared__ vertex_t min_color[32]; - min_edge_index[lane_id] = std::numeric_limits::max(); + min_edge_index[lane_id] = std::numeric_limits::max(); min_edge_weight[lane_id] = std::numeric_limits::max(); - min_color[lane_id] = std::numeric_limits::max(); + min_color[lane_id] = std::numeric_limits::max(); __syncthreads(); vertex_t self_color_idx = color_index[warp_id]; - vertex_t self_color = color[self_color_idx]; + vertex_t self_color = color[self_color_idx]; // find the minimum edge associated per row // each thread in warp holds the minimum edge for @@ -62,20 +56,20 @@ __global__ void kernel_min_edge_per_vertex(const edge_t* offsets, if (warp_id < v) { // one row is associated with one warp edge_t row_start = offsets[warp_id]; - edge_t row_end = offsets[warp_id + 1]; + edge_t row_end = offsets[warp_id + 1]; // assuming one warp per row // find min for each thread in warp for (edge_t e = row_start + lane_id; e < row_end; e += 32) { alteration_t curr_edge_weight = weights[e]; - vertex_t successor_color_idx = color_index[indices[e]]; - vertex_t successor_color = color[successor_color_idx]; + vertex_t successor_color_idx = color_index[indices[e]]; + vertex_t successor_color = color[successor_color_idx]; if (!mst_edge[e] && self_color != successor_color) { if (curr_edge_weight < min_edge_weight[lane_id]) { - min_color[lane_id] = successor_color; + min_color[lane_id] = successor_color; min_edge_weight[lane_id] = curr_edge_weight; - min_edge_index[lane_id] = e; + min_edge_index[lane_id] = e; } } } @@ -88,9 +82,9 @@ __global__ void kernel_min_edge_per_vertex(const edge_t* offsets, for (int offset = 16; offset > 0; offset >>= 1) { if (lane_id < offset) { if (min_edge_weight[lane_id] > min_edge_weight[lane_id + offset]) { - min_color[lane_id] = min_color[lane_id + offset]; + min_color[lane_id] = min_color[lane_id + offset]; min_edge_weight[lane_id] = min_edge_weight[lane_id + offset]; - min_edge_index[lane_id] = min_edge_index[lane_id + offset]; + min_edge_index[lane_id] = min_edge_index[lane_id + offset]; } } __syncthreads(); @@ -108,26 +102,19 @@ __global__ void kernel_min_edge_per_vertex(const edge_t* offsets, } } -template -__global__ void min_edge_per_supervertex(const vertex_t* color, - const vertex_t* color_index, - edge_t* new_mst_edge, - bool* mst_edge, - const vertex_t* indices, - const weight_t* weights, - const alteration_t* altered_weights, - vertex_t* temp_src, - vertex_t* temp_dst, - weight_t* temp_weights, - const alteration_t* min_edge_color, - const vertex_t v, - bool symmetrize_output) -{ +template +__global__ void min_edge_per_supervertex( + const vertex_t* color, const vertex_t* color_index, edge_t* new_mst_edge, + bool* mst_edge, const vertex_t* indices, const weight_t* weights, + const alteration_t* altered_weights, vertex_t* temp_src, vertex_t* temp_dst, + weight_t* temp_weights, const alteration_t* min_edge_color, const vertex_t v, + bool symmetrize_output) { auto tid = get_1D_idx(); if (tid < v) { vertex_t vertex_color_idx = color_index[tid]; - vertex_t vertex_color = color[vertex_color_idx]; - edge_t edge_idx = new_mst_edge[tid]; + vertex_t vertex_color = color[vertex_color_idx]; + edge_t edge_idx = new_mst_edge[tid]; // check if valid outgoing edge was found // find minimum edge is same as minimum edge of whole supervertex @@ -142,27 +129,32 @@ __global__ void min_edge_per_supervertex(const vertex_t* color, auto dst = indices[edge_idx]; if (!symmetrize_output) { auto dst_edge_idx = new_mst_edge[dst]; - auto dst_color = color[color_index[dst]]; + auto dst_color = color[color_index[dst]]; // vertices added each other // only if destination has found an edge // the edge points back to source // the edge is minimum edge found for dst color - if (dst_edge_idx != std::numeric_limits::max() && indices[dst_edge_idx] == tid && + if (dst_edge_idx != std::numeric_limits::max() && + indices[dst_edge_idx] == tid && min_edge_color[dst_color] == altered_weights[dst_edge_idx]) { - if (vertex_color > dst_color) { add_edge = false; } + if (vertex_color > dst_color) { + add_edge = false; + } } } if (add_edge) { - temp_src[tid] = tid; - temp_dst[tid] = dst; - temp_weights[tid] = weights[edge_idx]; + temp_src[tid] = tid; + temp_dst[tid] = dst; + temp_weights[tid] = weights[edge_idx]; mst_edge[edge_idx] = true; } } - if (!add_edge) { new_mst_edge[tid] = std::numeric_limits::max(); } + if (!add_edge) { + new_mst_edge[tid] = std::numeric_limits::max(); + } } } } @@ -170,13 +162,9 @@ __global__ void min_edge_per_supervertex(const vertex_t* color, template __global__ void add_reverse_edge(const edge_t* new_mst_edge, const vertex_t* indices, - const weight_t* weights, - vertex_t* temp_src, - vertex_t* temp_dst, - weight_t* temp_weights, - const vertex_t v, - bool symmetrize_output) -{ + const weight_t* weights, vertex_t* temp_src, + vertex_t* temp_dst, weight_t* temp_weights, + const vertex_t v, bool symmetrize_output) { auto tid = get_1D_idx(); if (tid < v) { @@ -198,7 +186,9 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge, // if vertices did not pick each other // add a reverse edge - if (tid != neighbor_vertex_neighbor) { reverse_needed = true; } + if (tid != neighbor_vertex_neighbor) { + reverse_needed = true; + } } } @@ -207,8 +197,8 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge, // it is assumed the each vertex only picks one valid min edge // per cycle // hence, we store at index tid + v for the reverse edge scenario - temp_src[tid + v] = neighbor_vertex; - temp_dst[tid + v] = tid; + temp_src[tid + v] = neighbor_vertex; + temp_dst[tid + v] = tid; temp_weights[tid + v] = weights[edge_idx]; } } @@ -217,13 +207,11 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge, // executes for newly added mst edges and updates the colors of both vertices to the lower color template -__global__ void min_pair_colors(const vertex_t v, - const vertex_t* indices, +__global__ void min_pair_colors(const vertex_t v, const vertex_t* indices, const edge_t* new_mst_edge, const vertex_t* color, const vertex_t* color_index, - vertex_t* next_color) -{ + vertex_t* next_color) { auto i = get_1D_idx(); if (i < v) { @@ -232,9 +220,9 @@ __global__ void min_pair_colors(const vertex_t v, if (edge_idx != std::numeric_limits::max()) { vertex_t neighbor_vertex = indices[edge_idx]; // vertex_t self_color = color[i]; - vertex_t self_color_idx = color_index[i]; - vertex_t self_color = color[self_color_idx]; - vertex_t neighbor_color_idx = color_index[neighbor_vertex]; + vertex_t self_color_idx = color_index[i]; + vertex_t self_color = color[self_color_idx]; + vertex_t neighbor_color_idx = color_index[neighbor_vertex]; vertex_t neighbor_super_color = color[neighbor_color_idx]; // update my own color as source of edge @@ -250,36 +238,33 @@ __global__ void min_pair_colors(const vertex_t v, // for each vertex, update color if it was changed in min_pair_colors kernel template -__global__ void update_colors(const vertex_t v, - vertex_t* color, +__global__ void update_colors(const vertex_t v, vertex_t* color, const vertex_t* color_index, - const vertex_t* next_color, - bool* done) -{ + const vertex_t* next_color, bool* done) { auto i = get_1D_idx(); if (i < v) { - vertex_t self_color = color[i]; + vertex_t self_color = color[i]; vertex_t self_color_idx = color_index[i]; - vertex_t new_color = next_color[self_color_idx]; + vertex_t new_color = next_color[self_color_idx]; // update self color to new smaller color if (self_color > new_color) { color[i] = new_color; - *done = false; + *done = false; } } } // point vertices to their final color index template -__global__ void final_color_indices(const vertex_t v, const vertex_t* color, vertex_t* color_index) -{ +__global__ void final_color_indices(const vertex_t v, const vertex_t* color, + vertex_t* color_index) { auto i = get_1D_idx(); if (i < v) { vertex_t self_color_idx = color_index[i]; - vertex_t self_color = color[self_color_idx]; + vertex_t self_color = color[self_color_idx]; // if self color is not equal to self color index, // it means self is not supervertex @@ -287,7 +272,7 @@ __global__ void final_color_indices(const vertex_t v, const vertex_t* color, ver // parent supervertex while (self_color_idx != self_color) { self_color_idx = color_index[self_color]; - self_color = color[self_color_idx]; + self_color = color[self_color_idx]; } // point to new supervertex @@ -297,23 +282,22 @@ __global__ void final_color_indices(const vertex_t v, const vertex_t* color, ver // Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu // Consider using curand device API instead of precomputed random_values array -template -__global__ void alteration_kernel(const vertex_t v, - const edge_t e, +template +__global__ void alteration_kernel(const vertex_t v, const edge_t e, const edge_t* offsets, const vertex_t* indices, - const weight_t* weights, - alteration_t max, + const weight_t* weights, alteration_t max, alteration_t* random_values, - alteration_t* altered_weights) -{ + alteration_t* altered_weights) { auto row = get_1D_idx(); if (row < v) { auto row_begin = offsets[row]; - auto row_end = offsets[row + 1]; + auto row_end = offsets[row + 1]; for (auto i = row_begin; i < row_end; i++) { - auto column = indices[i]; - altered_weights[i] = weights[i] + max * (random_values[row] + random_values[column]); + auto column = indices[i]; + altered_weights[i] = + weights[i] + max * (random_values[row] + random_values[column]); } } } @@ -321,15 +305,17 @@ __global__ void alteration_kernel(const vertex_t v, template __global__ void kernel_count_new_mst_edges(const vertex_t* mst_src, edge_t* mst_edge_count, - const vertex_t v) -{ + const vertex_t v) { auto tid = get_1D_idx(); // count number of new mst edges added - bool predicate = tid < v && (mst_src[tid] != std::numeric_limits::max()); + bool predicate = + tid < v && (mst_src[tid] != std::numeric_limits::max()); vertex_t block_count = __syncthreads_count(predicate); - if (threadIdx.x == 0 && block_count > 0) { atomicAdd(mst_edge_count, block_count); } + if (threadIdx.x == 0 && block_count > 0) { + atomicAdd(mst_edge_count, block_count); + } } } // namespace detail diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh index 158f4cc314..c5ba4fcb4f 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh +++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh @@ -46,30 +46,21 @@ typedef std::chrono::high_resolution_clock Clock; // curand generator uniform inline curandStatus_t curand_generate_uniformX(curandGenerator_t generator, - float* outputPtr, - size_t n) -{ + float* outputPtr, size_t n) { return curandGenerateUniform(generator, outputPtr, n); } inline curandStatus_t curand_generate_uniformX(curandGenerator_t generator, - double* outputPtr, - size_t n) -{ + double* outputPtr, size_t n) { return curandGenerateUniformDouble(generator, outputPtr, n); } -template -MST_solver::MST_solver(const raft::handle_t& handle_, - const edge_t* offsets_, - const vertex_t* indices_, - const weight_t* weights_, - const vertex_t v_, - const edge_t e_, - vertex_t* color_, - cudaStream_t stream_, - bool symmetrize_output_, - bool initialize_colors_, - int iterations_) +template +MST_solver::MST_solver( + const raft::handle_t& handle_, const edge_t* offsets_, + const vertex_t* indices_, const weight_t* weights_, const vertex_t v_, + const edge_t e_, vertex_t* color_, cudaStream_t stream_, + bool symmetrize_output_, bool initialize_colors_, int iterations_) : handle(handle_), offsets(offsets_), indices(indices_), @@ -91,13 +82,12 @@ MST_solver::MST_solver(const raft::han stream(stream_), symmetrize_output(symmetrize_output_), initialize_colors(initialize_colors_), - iterations(iterations_) -{ - max_blocks = handle_.get_device_properties().maxGridSize[0]; + iterations(iterations_) { + max_blocks = handle_.get_device_properties().maxGridSize[0]; max_threads = handle_.get_device_properties().maxThreadsPerBlock; - sm_count = handle_.get_device_properties().multiProcessorCount; + sm_count = handle_.get_device_properties().multiProcessorCount; - // Initially, color holds the vertex id as color + //Initially, color holds the vertex id as color auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); if (initialize_colors_) { thrust::sequence(policy, color.begin(), color.end(), 0); @@ -108,10 +98,10 @@ MST_solver::MST_solver(const raft::han thrust::sequence(policy, next_color.begin(), next_color.end(), 0); } -template +template raft::Graph_COO -MST_solver::solve() -{ +MST_solver::solve() { RAFT_EXPECTS(v > 0, "0 vertices"); RAFT_EXPECTS(e > 0, "0 edges"); RAFT_EXPECTS(offsets != nullptr, "Null offsets."); @@ -124,13 +114,12 @@ MST_solver::solve() // Alterating the weights // this is done by identifying the lowest cost edge weight gap that is not 0, call this theta. - // For each edge, add noise that is less than theta. That is, generate a random number in the - // range [0.0, theta) and add it to each edge weight. + // For each edge, add noise that is less than theta. That is, generate a random number in the range [0.0, theta) and add it to each edge weight. alteration(); #ifdef MST_TIME auto stop = Clock::now(); - timer0 = duration_us(stop - start); + timer0 = duration_us(stop - start); #endif auto max_mst_edges = symmetrize_output ? 2 * v - 2 : v - 1; @@ -179,8 +168,8 @@ MST_solver::solve() if (curr_mst_edge_count == prev_mst_edge_count[0]) { #ifdef MST_TIME std::cout << "Iterations: " << i << std::endl; - std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3 << "," << timer4 << "," - << timer5 << std::endl; + std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3 + << "," << timer4 << "," << timer5 << std::endl; #endif // exit here when reaching steady state break; @@ -190,7 +179,8 @@ MST_solver::solve() start = Clock::now(); #endif // append the newly found MST edges to the final output - append_src_dst_pair(mst_result.src.data(), mst_result.dst.data(), mst_result.weights.data()); + append_src_dst_pair(mst_result.src.data(), mst_result.dst.data(), + mst_result.weights.data()); #ifdef MST_TIME stop = Clock::now(); timer4 += duration_us(stop - start); @@ -211,7 +201,7 @@ MST_solver::solve() // result packaging thrust::host_vector host_mst_edge_count = mst_edge_count; - mst_result.n_edges = host_mst_edge_count[0]; + mst_result.n_edges = host_mst_edge_count[0]; mst_result.src.resize(mst_result.n_edges, stream); mst_result.dst.resize(mst_result.n_edges, stream); mst_result.weights.resize(mst_result.n_edges, stream); @@ -222,46 +212,50 @@ MST_solver::solve() // ||y|-|x|| template struct alteration_functor { - __host__ __device__ weight_t operator()(const thrust::tuple& t) - { + __host__ __device__ weight_t + operator()(const thrust::tuple& t) { auto x = thrust::get<0>(t); auto y = thrust::get<1>(t); - x = x < 0 ? -x : x; - y = y < 0 ? -y : y; + x = x < 0 ? -x : x; + y = y < 0 ? -y : y; return x < y ? y - x : x - y; } }; // Compute the uper bound for the alteration -template -alteration_t MST_solver::alteration_max() -{ +template +alteration_t +MST_solver::alteration_max() { auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); rmm::device_vector tmp(e); thrust::device_ptr weights_ptr(weights); thrust::copy(policy, weights_ptr, weights_ptr + e, tmp.begin()); - // sort tmp weights + //sort tmp weights thrust::sort(policy, tmp.begin(), tmp.end()); - // remove duplicates + //remove duplicates auto new_end = thrust::unique(policy, tmp.begin(), tmp.end()); - // min(a[i+1]-a[i])/2 - auto begin = thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1)); - auto end = thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end)); - auto init = tmp[1] - tmp[0]; - auto max = thrust::transform_reduce( - policy, begin, end, alteration_functor(), init, thrust::minimum()); + //min(a[i+1]-a[i])/2 + auto begin = + thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1)); + auto end = + thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end)); + auto init = tmp[1] - tmp[0]; + auto max = + thrust::transform_reduce(policy, begin, end, alteration_functor(), + init, thrust::minimum()); return max / static_cast(2); } // Compute the alteration to make all undirected edge weight unique // Preserves weights order -template -void MST_solver::alteration() -{ +template +void MST_solver::alteration() { auto nthreads = std::min(v, max_threads); - auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); + auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); // maximum alteration that does not change realtive weights order alteration_t max = alteration_max(); @@ -275,32 +269,35 @@ void MST_solver::alteration() curandSetPseudoRandomGeneratorSeed(randGen, 1234567); // Initialize rand values - auto curand_status = curand_generate_uniformX(randGen, rand_values.data().get(), v); + auto curand_status = + curand_generate_uniformX(randGen, rand_values.data().get(), v); RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND failed"); curand_status = curandDestroyGenerator(randGen); - RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND cleanup failed"); + RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, + "MST: CURAND cleanup failed"); - // Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu + //Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu detail::alteration_kernel<<>>( - v, e, offsets, indices, weights, max, rand_values.data().get(), altered_weights.data().get()); + v, e, offsets, indices, weights, max, rand_values.data().get(), + altered_weights.data().get()); } // updates colors of vertices by propagating the lower color to the higher -template -void MST_solver::label_prop(vertex_t* mst_src, - vertex_t* mst_dst) -{ +template +void MST_solver::label_prop( + vertex_t* mst_src, vertex_t* mst_dst) { // update the colors of both ends its until there is no change in colors thrust::host_vector curr_mst_edge_count = mst_edge_count; auto min_pair_nthreads = std::min(v, (vertex_t)max_threads); - auto min_pair_nblocks = - std::min((v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks); + auto min_pair_nblocks = std::min( + (v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks); rmm::device_vector done(1, false); edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - vertex_t* color_ptr = color.data().get(); + vertex_t* color_ptr = color.data().get(); vertex_t* next_color_ptr = next_color.data().get(); bool* done_ptr = done.data().get(); @@ -317,99 +314,84 @@ void MST_solver::label_prop(vertex_t* i++; } - detail::final_color_indices<<>>( - v, color_ptr, color_index); + detail:: + final_color_indices<<>>( + v, color_ptr, color_index); #ifdef MST_TIME std::cout << "Label prop iterations: " << i << std::endl; #endif } // Finds the minimum edge from each vertex to the lowest color -template -void MST_solver::min_edge_per_vertex() -{ +template +void MST_solver::min_edge_per_vertex() { auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); - thrust::fill( - policy, min_edge_color.begin(), min_edge_color.end(), std::numeric_limits::max()); - thrust::fill( - policy, new_mst_edge.begin(), new_mst_edge.end(), std::numeric_limits::max()); + thrust::fill(policy, min_edge_color.begin(), min_edge_color.end(), + std::numeric_limits::max()); + thrust::fill(policy, new_mst_edge.begin(), new_mst_edge.end(), + std::numeric_limits::max()); int n_threads = 32; - vertex_t* color_ptr = color.data().get(); - edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - bool* mst_edge_ptr = mst_edge.data().get(); - alteration_t* min_edge_color_ptr = min_edge_color.data().get(); + vertex_t* color_ptr = color.data().get(); + edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); + bool* mst_edge_ptr = mst_edge.data().get(); + alteration_t* min_edge_color_ptr = min_edge_color.data().get(); alteration_t* altered_weights_ptr = altered_weights.data().get(); - detail::kernel_min_edge_per_vertex<<>>(offsets, - indices, - altered_weights_ptr, - color_ptr, - color_index, - new_mst_edge_ptr, - mst_edge_ptr, - min_edge_color_ptr, - v); + detail::kernel_min_edge_per_vertex<<>>( + offsets, indices, altered_weights_ptr, color_ptr, color_index, + new_mst_edge_ptr, mst_edge_ptr, min_edge_color_ptr, v); } // Finds the minimum edge from each supervertex to the lowest color -template -void MST_solver::min_edge_per_supervertex() -{ +template +void MST_solver::min_edge_per_supervertex() { auto nthreads = std::min(v, max_threads); - auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); + auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); - thrust::fill(policy, temp_src.begin(), temp_src.end(), std::numeric_limits::max()); + thrust::fill(policy, temp_src.begin(), temp_src.end(), + std::numeric_limits::max()); - vertex_t* color_ptr = color.data().get(); - edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - bool* mst_edge_ptr = mst_edge.data().get(); - alteration_t* min_edge_color_ptr = min_edge_color.data().get(); + vertex_t* color_ptr = color.data().get(); + edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); + bool* mst_edge_ptr = mst_edge.data().get(); + alteration_t* min_edge_color_ptr = min_edge_color.data().get(); alteration_t* altered_weights_ptr = altered_weights.data().get(); - vertex_t* temp_src_ptr = temp_src.data().get(); - vertex_t* temp_dst_ptr = temp_dst.data().get(); - weight_t* temp_weights_ptr = temp_weights.data().get(); - - detail::min_edge_per_supervertex<<>>(color_ptr, - color_index, - new_mst_edge_ptr, - mst_edge_ptr, - indices, - weights, - altered_weights_ptr, - temp_src_ptr, - temp_dst_ptr, - temp_weights_ptr, - min_edge_color_ptr, - v, - symmetrize_output); + vertex_t* temp_src_ptr = temp_src.data().get(); + vertex_t* temp_dst_ptr = temp_dst.data().get(); + weight_t* temp_weights_ptr = temp_weights.data().get(); + + detail::min_edge_per_supervertex<<>>( + color_ptr, color_index, new_mst_edge_ptr, mst_edge_ptr, indices, weights, + altered_weights_ptr, temp_src_ptr, temp_dst_ptr, temp_weights_ptr, + min_edge_color_ptr, v, symmetrize_output); // the above kernel only adds directed mst edges in the case where // a pair of vertices don't pick the same min edge between them // so, now we add the reverse edge to make it undirected if (symmetrize_output) { - detail::add_reverse_edge<<>>(new_mst_edge_ptr, - indices, - weights, - temp_src_ptr, - temp_dst_ptr, - temp_weights_ptr, - v, - symmetrize_output); + detail::add_reverse_edge<<>>( + new_mst_edge_ptr, indices, weights, temp_src_ptr, temp_dst_ptr, + temp_weights_ptr, v, symmetrize_output); } } -template -void MST_solver::check_termination() -{ +template +void MST_solver::check_termination() { vertex_t nthreads = std::min(2 * v, (vertex_t)max_threads); - vertex_t nblocks = std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks); + vertex_t nblocks = + std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks); // count number of new mst edges edge_t* mst_edge_count_ptr = mst_edge_count.data().get(); - vertex_t* temp_src_ptr = temp_src.data().get(); + vertex_t* temp_src_ptr = temp_src.data().get(); detail::kernel_count_new_mst_edges<<>>( temp_src_ptr, mst_edge_count_ptr, 2 * v); @@ -417,40 +399,36 @@ void MST_solver::check_termination() template struct new_edges_functor { - __host__ __device__ bool operator()(const thrust::tuple& t) - { + __host__ __device__ bool operator()( + const thrust::tuple& t) { auto src = thrust::get<0>(t); return src != std::numeric_limits::max() ? true : false; } }; -template +template void MST_solver::append_src_dst_pair( - vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) -{ + vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) { auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); auto curr_mst_edge_count = prev_mst_edge_count[0]; // iterator to end of mst edges added to final output in previous iteration - auto src_dst_zip_end = - thrust::make_zip_iterator(thrust::make_tuple(mst_src + curr_mst_edge_count, - mst_dst + curr_mst_edge_count, - mst_weights + curr_mst_edge_count)); + auto src_dst_zip_end = thrust::make_zip_iterator(thrust::make_tuple( + mst_src + curr_mst_edge_count, mst_dst + curr_mst_edge_count, + mst_weights + curr_mst_edge_count)); // iterator to new mst edges found - auto temp_src_dst_zip_begin = thrust::make_zip_iterator( - thrust::make_tuple(temp_src.begin(), temp_dst.begin(), temp_weights.begin())); + auto temp_src_dst_zip_begin = thrust::make_zip_iterator(thrust::make_tuple( + temp_src.begin(), temp_dst.begin(), temp_weights.begin())); auto temp_src_dst_zip_end = thrust::make_zip_iterator( thrust::make_tuple(temp_src.end(), temp_dst.end(), temp_weights.end())); // copy new mst edges to final output - thrust::copy_if(policy, - temp_src_dst_zip_begin, - temp_src_dst_zip_end, - src_dst_zip_end, - new_edges_functor()); + thrust::copy_if(policy, temp_src_dst_zip_begin, temp_src_dst_zip_end, + src_dst_zip_end, new_edges_functor()); } } // namespace mst diff --git a/cpp/include/raft/sparse/mst/detail/utils.cuh b/cpp/include/raft/sparse/mst/detail/utils.cuh index 24127c993f..8f755de459 100644 --- a/cpp/include/raft/sparse/mst/detail/utils.cuh +++ b/cpp/include/raft/sparse/mst/detail/utils.cuh @@ -26,29 +26,32 @@ namespace mst { namespace detail { template -__device__ idx_t get_1D_idx() -{ +__device__ idx_t get_1D_idx() { return blockIdx.x * blockDim.x + threadIdx.x; } // somewhat smart vector print template -void printv(rmm::device_vector& vec, const std::string& name = "", const size_t displ = 5) -{ +void printv(rmm::device_vector& vec, const std::string& name = "", + const size_t displ = 5) { #ifdef MST_TIME std::cout.precision(15); std::cout << name << " size = " << vec.size() << std::endl; if (displ < vec.size()) { - thrust::copy(vec.begin(), vec.begin() + displ, std::ostream_iterator(std::cout, " ")); + thrust::copy(vec.begin(), vec.begin() + displ, + std::ostream_iterator(std::cout, " ")); std::cout << " ... "; - thrust::copy(vec.end() - displ, vec.end(), std::ostream_iterator(std::cout, " ")); + thrust::copy(vec.end() - displ, vec.end(), + std::ostream_iterator(std::cout, " ")); } else { - thrust::copy(vec.begin(), vec.end(), std::ostream_iterator(std::cout, " ")); + thrust::copy(vec.begin(), vec.end(), + std::ostream_iterator(std::cout, " ")); } std::cout << std::endl << std::endl; #endif } -#define duration_us(a) std::chrono::duration_cast(a).count() +#define duration_us(a) \ + std::chrono::duration_cast(a).count() } // namespace detail } // namespace mst diff --git a/cpp/include/raft/sparse/mst/mst.cuh b/cpp/include/raft/sparse/mst/mst.cuh index b49003467b..10c981445e 100644 --- a/cpp/include/raft/sparse/mst/mst.cuh +++ b/cpp/include/raft/sparse/mst/mst.cuh @@ -22,30 +22,16 @@ namespace raft { namespace mst { -template -raft::Graph_COO mst(const raft::handle_t& handle, - edge_t const* offsets, - vertex_t const* indices, - weight_t const* weights, - vertex_t const v, - edge_t const e, - vertex_t* color, - cudaStream_t stream, - bool symmetrize_output = true, - bool initialize_colors = true, - int iterations = 0) -{ - MST_solver mst_solver(handle, - offsets, - indices, - weights, - v, - e, - color, - stream, - symmetrize_output, - initialize_colors, - iterations); +template +raft::Graph_COO mst( + const raft::handle_t& handle, edge_t const* offsets, vertex_t const* indices, + weight_t const* weights, vertex_t const v, edge_t const e, vertex_t* color, + cudaStream_t stream, bool symmetrize_output = true, + bool initialize_colors = true, int iterations = 0) { + MST_solver mst_solver( + handle, offsets, indices, weights, v, e, color, stream, symmetrize_output, + initialize_colors, iterations); return mst_solver.solve(); } diff --git a/cpp/include/raft/sparse/mst/mst_solver.cuh b/cpp/include/raft/sparse/mst/mst_solver.cuh index e32bcfacac..833882ea0d 100644 --- a/cpp/include/raft/sparse/mst/mst_solver.cuh +++ b/cpp/include/raft/sparse/mst/mst_solver.cuh @@ -31,27 +31,20 @@ struct Graph_COO { edge_t n_edges; Graph_COO(vertex_t size, cudaStream_t stream) - : src(size, stream), dst(size, stream), weights(size, stream) - { - } + : src(size, stream), dst(size, stream), weights(size, stream) {} }; namespace mst { -template +template class MST_solver { public: - MST_solver(const raft::handle_t& handle_, - const edge_t* offsets_, - const vertex_t* indices_, - const weight_t* weights_, - const vertex_t v_, - const edge_t e_, - vertex_t* color_, - cudaStream_t stream_, - bool symmetrize_output_, - bool initialize_colors_, - int iterations_); + MST_solver(const raft::handle_t& handle_, const edge_t* offsets_, + const vertex_t* indices_, const weight_t* weights_, + const vertex_t v_, const edge_t e_, vertex_t* color_, + cudaStream_t stream_, bool symmetrize_output_, + bool initialize_colors_, int iterations_); raft::Graph_COO solve(); @@ -63,7 +56,7 @@ class MST_solver { bool symmetrize_output, initialize_colors; int iterations; - // CSR + //CSR const edge_t* offsets; const vertex_t* indices; const weight_t* weights; @@ -74,16 +67,20 @@ class MST_solver { vertex_t max_threads; vertex_t sm_count; - vertex_t* color_index; // represent each supervertex as a color - rmm::device_vector min_edge_color; // minimum incident edge weight per color - rmm::device_vector new_mst_edge; // new minimum edge per vertex - rmm::device_vector altered_weights; // weights to be used for mst - rmm::device_vector mst_edge_count; // total number of edges added after every iteration + vertex_t* color_index; // represent each supervertex as a color + rmm::device_vector + min_edge_color; // minimum incident edge weight per color + rmm::device_vector new_mst_edge; // new minimum edge per vertex + rmm::device_vector + altered_weights; // weights to be used for mst rmm::device_vector - prev_mst_edge_count; // total number of edges up to the previous iteration - rmm::device_vector mst_edge; // mst output - true if the edge belongs in mst + mst_edge_count; // total number of edges added after every iteration + rmm::device_vector + prev_mst_edge_count; // total number of edges up to the previous iteration + rmm::device_vector + mst_edge; // mst output - true if the edge belongs in mst rmm::device_vector next_color; // next iteration color - rmm::device_vector color; // index of color that vertex points to + rmm::device_vector color; // index of color that vertex points to // new src-dst pairs found per iteration rmm::device_vector temp_src; @@ -96,7 +93,8 @@ class MST_solver { void check_termination(); void alteration(); alteration_t alteration_max(); - void append_src_dst_pair(vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights); + void append_src_dst_pair(vertex_t* mst_src, vertex_t* mst_dst, + weight_t* mst_weights); }; } // namespace mst diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh index 397fecaaea..562d506cfe 100644 --- a/cpp/include/raft/sparse/op/filter.cuh +++ b/cpp/include/raft/sparse/op/filter.cuh @@ -42,23 +42,15 @@ namespace sparse { namespace op { template -__global__ void coo_remove_scalar_kernel(const int* rows, - const int* cols, - const T* vals, - int nnz, - int* crows, - int* ccols, - T* cvals, - int* ex_scan, - int* cur_ex_scan, - int m, - T scalar) -{ +__global__ void coo_remove_scalar_kernel(const int *rows, const int *cols, + const T *vals, int nnz, int *crows, + int *ccols, T *cvals, int *ex_scan, + int *cur_ex_scan, int m, T scalar) { int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < m) { - int start = cur_ex_scan[row]; - int stop = get_stop_idx(row, m, nnz, cur_ex_scan); + int start = cur_ex_scan[row]; + int stop = get_stop_idx(row, m, nnz, cur_ex_scan); int cur_out_idx = ex_scan[row]; for (int idx = start; idx < stop; idx++) { @@ -90,51 +82,37 @@ __global__ void coo_remove_scalar_kernel(const int* rows, * @param stream: cuda stream to use */ template -void coo_remove_scalar(const int* rows, - const int* cols, - const T* vals, - int nnz, - int* crows, - int* ccols, - T* cvals, - int* cnnz, - int* cur_cnnz, - T scalar, - int n, +void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, + int *crows, int *ccols, T *cvals, int *cnnz, + int *cur_cnnz, T scalar, int n, std::shared_ptr d_alloc, - cudaStream_t stream) -{ + cudaStream_t stream) { raft::mr::device::buffer ex_scan(d_alloc, stream, n); raft::mr::device::buffer cur_ex_scan(d_alloc, stream, n); CUDA_CHECK(cudaMemsetAsync(ex_scan.data(), 0, n * sizeof(int), stream)); CUDA_CHECK(cudaMemsetAsync(cur_ex_scan.data(), 0, n * sizeof(int), stream)); - thrust::device_ptr dev_cnnz = thrust::device_pointer_cast(cnnz); - thrust::device_ptr dev_ex_scan = thrust::device_pointer_cast(ex_scan.data()); - thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan); + thrust::device_ptr dev_cnnz = thrust::device_pointer_cast(cnnz); + thrust::device_ptr dev_ex_scan = + thrust::device_pointer_cast(ex_scan.data()); + thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cnnz, dev_cnnz + n, + dev_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); - thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz); - thrust::device_ptr dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data()); - thrust::exclusive_scan( - thrust::cuda::par.on(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan); + thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz); + thrust::device_ptr dev_cur_ex_scan = + thrust::device_pointer_cast(cur_ex_scan.data()); + thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cur_cnnz, + dev_cur_cnnz + n, dev_cur_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); dim3 grid(raft::ceildiv(n, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - coo_remove_scalar_kernel<<>>(rows, - cols, - vals, - nnz, - crows, - ccols, - cvals, - dev_ex_scan.get(), - dev_cur_ex_scan.get(), - n, - scalar); + coo_remove_scalar_kernel<<>>( + rows, cols, vals, nnz, crows, ccols, cvals, dev_ex_scan.get(), + dev_cur_ex_scan.get(), n, scalar); CUDA_CHECK(cudaPeekAtLastError()); } @@ -148,44 +126,35 @@ void coo_remove_scalar(const int* rows, * @param stream: cuda stream to use */ template -void coo_remove_scalar(COO* in, - COO* out, - T scalar, +void coo_remove_scalar(COO *in, COO *out, T scalar, std::shared_ptr d_alloc, - cudaStream_t stream) -{ + cudaStream_t stream) { raft::mr::device::buffer row_count_nz(d_alloc, stream, in->n_rows); raft::mr::device::buffer row_count(d_alloc, stream, in->n_rows); - CUDA_CHECK(cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream)); - CUDA_CHECK(cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream)); + CUDA_CHECK( + cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream)); + CUDA_CHECK( + cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream)); linalg::coo_degree(in->rows(), in->nnz, row_count.data(), stream); CUDA_CHECK(cudaPeekAtLastError()); - linalg::coo_degree_scalar( - in->rows(), in->vals(), in->nnz, scalar, row_count_nz.data(), stream); + linalg::coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, + row_count_nz.data(), stream); CUDA_CHECK(cudaPeekAtLastError()); - thrust::device_ptr d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data()); - int out_nnz = - thrust::reduce(thrust::cuda::par.on(stream), d_row_count_nz, d_row_count_nz + in->n_rows); + thrust::device_ptr d_row_count_nz = + thrust::device_pointer_cast(row_count_nz.data()); + int out_nnz = thrust::reduce(thrust::cuda::par.on(stream), d_row_count_nz, + d_row_count_nz + in->n_rows); out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream); - coo_remove_scalar(in->rows(), - in->cols(), - in->vals(), - in->nnz, - out->rows(), - out->cols(), - out->vals(), - row_count_nz.data(), - row_count.data(), - scalar, - in->n_rows, - d_alloc, - stream); + coo_remove_scalar(in->rows(), in->cols(), in->vals(), in->nnz, + out->rows(), out->cols(), out->vals(), + row_count_nz.data(), row_count.data(), scalar, + in->n_rows, d_alloc, stream); CUDA_CHECK(cudaPeekAtLastError()); } @@ -198,11 +167,9 @@ void coo_remove_scalar(COO* in, * @param stream: cuda stream to use */ template -void coo_remove_zeros(COO* in, - COO* out, +void coo_remove_zeros(COO *in, COO *out, std::shared_ptr d_alloc, - cudaStream_t stream) -{ + cudaStream_t stream) { coo_remove_scalar(in, out, T(0.0), d_alloc, stream); } diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh index bc4d7bace5..53c9f89074 100644 --- a/cpp/include/raft/sparse/op/reduce.cuh +++ b/cpp/include/raft/sparse/op/reduce.cuh @@ -46,29 +46,25 @@ namespace sparse { namespace op { template -__global__ void compute_duplicates_diffs_kernel(const value_idx* rows, - const value_idx* cols, - value_idx* diff, - size_t nnz) -{ +__global__ void compute_duplicates_diffs_kernel(const value_idx *rows, + const value_idx *cols, + value_idx *diff, size_t nnz) { size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; value_idx d = 1; - if (tid == 0 || (rows[tid - 1] == rows[tid] && cols[tid - 1] == cols[tid])) d = 0; + if (tid == 0 || (rows[tid - 1] == rows[tid] && cols[tid - 1] == cols[tid])) + d = 0; diff[tid] = d; } template -__global__ void max_duplicates_kernel(const value_idx* src_rows, - const value_idx* src_cols, - const value_t* src_vals, - const value_idx* index, - value_idx* out_rows, - value_idx* out_cols, - value_t* out_vals, - size_t nnz) -{ +__global__ void max_duplicates_kernel(const value_idx *src_rows, + const value_idx *src_cols, + const value_t *src_vals, + const value_idx *index, + value_idx *out_rows, value_idx *out_cols, + value_t *out_vals, size_t nnz) { size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < nnz) { @@ -100,13 +96,13 @@ __global__ void max_duplicates_kernel(const value_idx* src_rows, * @param[in] stream cuda ops will be ordered wrt this stream */ template -void compute_duplicates_mask( - value_idx* mask, const value_idx* rows, const value_idx* cols, size_t nnz, cudaStream_t stream) -{ +void compute_duplicates_mask(value_idx *mask, const value_idx *rows, + const value_idx *cols, size_t nnz, + cudaStream_t stream) { CUDA_CHECK(cudaMemsetAsync(mask, 0, nnz * sizeof(value_idx), stream)); - compute_duplicates_diffs_kernel<<>>( - rows, cols, mask, nnz); + compute_duplicates_diffs_kernel<<>>(rows, cols, mask, nnz); } /** @@ -126,17 +122,12 @@ void compute_duplicates_mask( * @param[in] stream cuda ops will be ordered wrt this stream */ template -void max_duplicates(const raft::handle_t& handle, - raft::sparse::COO& out, - const value_idx* rows, - const value_idx* cols, - const value_t* vals, - size_t nnz, - size_t m, - size_t n) -{ +void max_duplicates(const raft::handle_t &handle, + raft::sparse::COO &out, + const value_idx *rows, const value_idx *cols, + const value_t *vals, size_t nnz, size_t m, size_t n) { auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); @@ -145,8 +136,8 @@ void max_duplicates(const raft::handle_t& handle, compute_duplicates_mask(diff.data(), rows, cols, nnz, stream); - thrust::exclusive_scan( - thrust::cuda::par.on(stream), diff.data(), diff.data() + diff.size(), diff.data()); + thrust::exclusive_scan(thrust::cuda::par.on(stream), diff.data(), + diff.data() + diff.size(), diff.data()); // compute final size value_idx size = 0; diff --git a/cpp/include/raft/sparse/op/row_op.cuh b/cpp/include/raft/sparse/op/row_op.cuh index 194a878ac1..9e5034dc28 100644 --- a/cpp/include/raft/sparse/op/row_op.cuh +++ b/cpp/include/raft/sparse/op/row_op.cuh @@ -38,12 +38,12 @@ namespace sparse { namespace op { template void> -__global__ void csr_row_op_kernel(const T* row_ind, T n_rows, T nnz, Lambda op) -{ +__global__ void csr_row_op_kernel(const T *row_ind, T n_rows, T nnz, + Lambda op) { T row = blockIdx.x * TPB_X + threadIdx.x; if (row < n_rows) { T start_idx = row_ind[row]; - T stop_idx = row < n_rows - 1 ? row_ind[row + 1] : nnz; + T stop_idx = row < n_rows - 1 ? row_ind[row + 1] : nnz; op(row, start_idx, stop_idx); } } @@ -59,12 +59,14 @@ __global__ void csr_row_op_kernel(const T* row_ind, T n_rows, T nnz, Lambda op) * @param op custom row operation functor accepting the row and beginning index. * @param stream cuda stream to use */ -template void> -void csr_row_op(const Index_* row_ind, Index_ n_rows, Index_ nnz, Lambda op, cudaStream_t stream) -{ +template void> +void csr_row_op(const Index_ *row_ind, Index_ n_rows, Index_ nnz, Lambda op, + cudaStream_t stream) { dim3 grid(raft::ceildiv(n_rows, Index_(TPB_X)), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_row_op_kernel<<>>(row_ind, n_rows, nnz, op); + csr_row_op_kernel + <<>>(row_ind, n_rows, nnz, op); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/sparse/op/slice.h b/cpp/include/raft/sparse/op/slice.h index 9bbe04cf34..46f4f41879 100644 --- a/cpp/include/raft/sparse/op/slice.h +++ b/cpp/include/raft/sparse/op/slice.h @@ -50,14 +50,10 @@ namespace op { * @param[in] stream : cuda stream for ordering events */ template -void csr_row_slice_indptr(value_idx start_row, - value_idx stop_row, - const value_idx* indptr, - value_idx* indptr_out, - value_idx* start_offset, - value_idx* stop_offset, - cudaStream_t stream) -{ +void csr_row_slice_indptr(value_idx start_row, value_idx stop_row, + const value_idx *indptr, value_idx *indptr_out, + value_idx *start_offset, value_idx *stop_offset, + cudaStream_t stream) { raft::update_host(start_offset, indptr + start_row, 1, stream); raft::update_host(stop_offset, indptr + stop_row + 1, 1, stream); @@ -67,12 +63,11 @@ void csr_row_slice_indptr(value_idx start_row, // 0-based indexing so we need to add 1 to stop row. Because we want n_rows+1, // we add another 1 to stop row. - raft::copy_async(indptr_out, indptr + start_row, (stop_row + 2) - start_row, stream); + raft::copy_async(indptr_out, indptr + start_row, (stop_row + 2) - start_row, + stream); raft::linalg::unaryOp( - indptr_out, - indptr_out, - (stop_row + 2) - start_row, + indptr_out, indptr_out, (stop_row + 2) - start_row, [s_offset] __device__(value_idx input) { return input - s_offset; }, stream); } @@ -90,15 +85,12 @@ void csr_row_slice_indptr(value_idx start_row, * @param[in] stream : cuda stream for ordering events */ template -void csr_row_slice_populate(value_idx start_offset, - value_idx stop_offset, - const value_idx* indices, - const value_t* data, - value_idx* indices_out, - value_t* data_out, - cudaStream_t stream) -{ - raft::copy(indices_out, indices + start_offset, stop_offset - start_offset, stream); +void csr_row_slice_populate(value_idx start_offset, value_idx stop_offset, + const value_idx *indices, const value_t *data, + value_idx *indices_out, value_t *data_out, + cudaStream_t stream) { + raft::copy(indices_out, indices + start_offset, stop_offset - start_offset, + stream); raft::copy(data_out, data + start_offset, stop_offset - start_offset, stream); } diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h index 3cab24fc09..9dbe2b67c5 100644 --- a/cpp/include/raft/sparse/op/sort.h +++ b/cpp/include/raft/sparse/op/sort.h @@ -42,8 +42,7 @@ namespace op { struct TupleComp { template - __host__ __device__ bool operator()(const one& t1, const two& t2) - { + __host__ __device__ bool operator()(const one &t1, const two &t2) { // sort first by each sample's color, if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false; @@ -67,21 +66,15 @@ struct TupleComp { * @param stream: cuda stream to use */ template -void coo_sort(int m, - int n, - int nnz, - int* rows, - int* cols, - T* vals, +void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, // TODO: Remove this std::shared_ptr d_alloc, - cudaStream_t stream) -{ + cudaStream_t stream) { auto coo_indices = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); // get all the colors in contiguous locations so we can map them to warps. - thrust::sort_by_key( - thrust::cuda::par.on(stream), coo_indices, coo_indices + nnz, vals, TupleComp()); + thrust::sort_by_key(thrust::cuda::par.on(stream), coo_indices, + coo_indices + nnz, vals, TupleComp()); } /** @@ -92,12 +85,12 @@ void coo_sort(int m, * @param stream: the cuda stream to use */ template -void coo_sort(COO* const in, +void coo_sort(COO *const in, // TODO: Remove this std::shared_ptr d_alloc, - cudaStream_t stream) -{ - coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), d_alloc, stream); + cudaStream_t stream) { + coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), + in->vals(), d_alloc, stream); } /** @@ -111,16 +104,16 @@ void coo_sort(COO* const in, * @param[in] stream cuda stream for which to order cuda operations */ template -void coo_sort_by_weight( - value_idx* rows, value_idx* cols, value_t* data, value_idx nnz, cudaStream_t stream) -{ +void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data, + value_idx nnz, cudaStream_t stream) { thrust::device_ptr t_rows = thrust::device_pointer_cast(rows); thrust::device_ptr t_cols = thrust::device_pointer_cast(cols); - thrust::device_ptr t_data = thrust::device_pointer_cast(data); + thrust::device_ptr t_data = thrust::device_pointer_cast(data); auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); - thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz, first); + thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz, + first); } }; // namespace op }; // end NAMESPACE sparse diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh index ec8bec6eb3..8aae90f1d8 100644 --- a/cpp/include/raft/sparse/selection/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/connect_components.cuh @@ -59,20 +59,17 @@ struct KeyValuePair { __host__ __device__ __forceinline__ KeyValuePair() {} /// Copy Constructor - __host__ __device__ __forceinline__ KeyValuePair(cub::KeyValuePair<_Key, _Value> kvp) - : key(kvp.key), value(kvp.value) - { - } + __host__ __device__ __forceinline__ + KeyValuePair(cub::KeyValuePair<_Key, _Value> kvp) + : key(kvp.key), value(kvp.value) {} /// Constructor - __host__ __device__ __forceinline__ KeyValuePair(Key const& key, Value const& value) - : key(key), value(value) - { - } + __host__ __device__ __forceinline__ KeyValuePair(Key const &key, + Value const &value) + : key(key), value(value) {} /// Inequality operator - __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair& b) - { + __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair &b) { return (value != b.value) || (key != b.key); } }; @@ -86,32 +83,31 @@ struct KeyValuePair { */ template struct FixConnectivitiesRedOp { - value_idx* colors; + value_idx *colors; value_idx m; - FixConnectivitiesRedOp(value_idx* colors_, value_idx m_) : colors(colors_), m(m_){}; + FixConnectivitiesRedOp(value_idx *colors_, value_idx m_) + : colors(colors_), m(m_){}; typedef typename cub::KeyValuePair KVP; - DI void operator()(value_idx rit, KVP* out, const KVP& other) - { - if (rit < m && other.value < out->value && colors[rit] != colors[other.key]) { - out->key = other.key; + DI void operator()(value_idx rit, KVP *out, const KVP &other) { + if (rit < m && other.value < out->value && + colors[rit] != colors[other.key]) { + out->key = other.key; out->value = other.value; } } - DI KVP operator()(value_idx rit, const KVP& a, const KVP& b) - { + DI KVP operator()(value_idx rit, const KVP &a, const KVP &b) { if (rit < m && a.value < b.value && colors[rit] != colors[a.key]) { return a; } else return b; } - DI void init(value_t* out, value_t maxVal) { *out = maxVal; } - DI void init(KVP* out, value_t maxVal) - { - out->key = -1; + DI void init(value_t *out, value_t maxVal) { *out = maxVal; } + DI void init(KVP *out, value_t maxVal) { + out->key = -1; out->value = maxVal; } }; @@ -123,8 +119,7 @@ struct FixConnectivitiesRedOp { */ struct TupleComp { template - __host__ __device__ bool operator()(const one& t1, const two& t2) - { + __host__ __device__ bool operator()(const one &t1, const two &t2) { // sort first by each sample's color, if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false; @@ -142,9 +137,13 @@ template struct CubKVPMinReduce { typedef cub::KeyValuePair KVP; - DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } + DI KVP operator()(LabelT rit, const KVP &a, const KVP &b) { + return b.value < a.value ? b : a; + } - DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } + DI KVP operator()(const KVP &a, const KVP &b) { + return b.value < a.value ? b : a; + } }; // KVPMinReduce @@ -159,14 +158,13 @@ struct CubKVPMinReduce { * @return total number of components */ template -value_idx get_n_components(value_idx* colors, - size_t n_rows, +value_idx get_n_components(value_idx *colors, size_t n_rows, std::shared_ptr d_alloc, - cudaStream_t stream) -{ - value_idx* map_ids; + cudaStream_t stream) { + value_idx *map_ids; int num_clusters; - raft::label::getUniquelabels(colors, n_rows, &map_ids, &num_clusters, stream, d_alloc); + raft::label::getUniquelabels(colors, n_rows, &map_ids, &num_clusters, stream, + d_alloc); d_alloc->deallocate(map_ids, num_clusters * sizeof(value_idx), stream); return num_clusters; @@ -179,12 +177,11 @@ value_idx get_n_components(value_idx* colors, */ template struct LookupColorOp { - value_idx* colors; + value_idx *colors; - LookupColorOp(value_idx* colors_) : colors(colors_) {} + LookupColorOp(value_idx *colors_) : colors(colors_) {} - DI value_idx operator()(const cub::KeyValuePair& kvp) - { + DI value_idx operator()(const cub::KeyValuePair &kvp) { return colors[kvp.key]; } }; @@ -194,8 +191,7 @@ struct LookupColorOp { * the given array of components * @tparam value_idx * @tparam value_t - * @param[out] kvp mapping of closest neighbor vertex and distance for each vertex in the given - * array of components + * @param[out] kvp mapping of closest neighbor vertex and distance for each vertex in the given array of components * @param[out] nn_colors components of nearest neighbors for each vertex * @param[in] colors components of each vertex * @param[in] X original dense data @@ -205,39 +201,25 @@ struct LookupColorOp { * @param[in] stream cuda stream for which to order cuda operations */ template -void perform_1nn(cub::KeyValuePair* kvp, - value_idx* nn_colors, - value_idx* colors, - const value_t* X, - size_t n_rows, - size_t n_cols, +void perform_1nn(cub::KeyValuePair *kvp, + value_idx *nn_colors, value_idx *colors, const value_t *X, + size_t n_rows, size_t n_cols, std::shared_ptr d_alloc, - cudaStream_t stream, - red_op reduction_op) -{ + cudaStream_t stream, red_op reduction_op) { rmm::device_uvector workspace(n_rows, stream); rmm::device_uvector x_norm(n_rows, stream); - raft::linalg::rowNorm(x_norm.data(), X, n_cols, n_rows, raft::linalg::L2Norm, true, stream); - - raft::distance::fusedL2NN, value_idx>( - kvp, - X, - X, - x_norm.data(), - x_norm.data(), - n_rows, - n_rows, - n_cols, - workspace.data(), - reduction_op, - reduction_op, - true, - true, - stream); + raft::linalg::rowNorm(x_norm.data(), X, n_cols, n_rows, raft::linalg::L2Norm, + true, stream); + + raft::distance::fusedL2NN, + value_idx>( + kvp, X, X, x_norm.data(), x_norm.data(), n_rows, n_rows, n_cols, + workspace.data(), reduction_op, reduction_op, true, true, stream); LookupColorOp extract_colors_op(colors); - thrust::transform(thrust::cuda::par.on(stream), kvp, kvp + n_rows, nn_colors, extract_colors_op); + thrust::transform(thrust::cuda::par.on(stream), kvp, kvp + n_rows, nn_colors, + extract_colors_op); } /** @@ -253,33 +235,27 @@ void perform_1nn(cub::KeyValuePair* kvp, * @param stream stream for which to order CUDA operations */ template -void sort_by_color(value_idx* colors, - value_idx* nn_colors, - cub::KeyValuePair* kvp, - value_idx* src_indices, - size_t n_rows, - cudaStream_t stream) -{ +void sort_by_color(value_idx *colors, value_idx *nn_colors, + cub::KeyValuePair *kvp, + value_idx *src_indices, size_t n_rows, cudaStream_t stream) { thrust::counting_iterator arg_sort_iter(0); - thrust::copy(thrust::cuda::par.on(stream), arg_sort_iter, arg_sort_iter + n_rows, src_indices); + thrust::copy(thrust::cuda::par.on(stream), arg_sort_iter, + arg_sort_iter + n_rows, src_indices); - auto keys = thrust::make_zip_iterator( - thrust::make_tuple(colors, nn_colors, (raft::linkage::KeyValuePair*)kvp)); + auto keys = thrust::make_zip_iterator(thrust::make_tuple( + colors, nn_colors, (raft::linkage::KeyValuePair *)kvp)); auto vals = thrust::make_zip_iterator(thrust::make_tuple(src_indices)); // get all the colors in contiguous locations so we can map them to warps. - thrust::sort_by_key(thrust::cuda::par.on(stream), keys, keys + n_rows, vals, TupleComp()); + thrust::sort_by_key(thrust::cuda::par.on(stream), keys, keys + n_rows, vals, + TupleComp()); } template -__global__ void min_components_by_color_kernel(value_idx* out_rows, - value_idx* out_cols, - value_t* out_vals, - const value_idx* out_index, - const value_idx* indices, - const cub::KeyValuePair* kvp, - size_t nnz) -{ +__global__ void min_components_by_color_kernel( + value_idx *out_rows, value_idx *out_cols, value_t *out_vals, + const value_idx *out_index, const value_idx *indices, + const cub::KeyValuePair *kvp, size_t nnz) { size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; @@ -308,20 +284,19 @@ __global__ void min_components_by_color_kernel(value_idx* out_rows, * @param[in] stream cuda stream for which to order cuda operations */ template -void min_components_by_color(raft::sparse::COO& coo, - const value_idx* out_index, - const value_idx* indices, - const cub::KeyValuePair* kvp, - size_t nnz, - cudaStream_t stream) -{ +void min_components_by_color(raft::sparse::COO &coo, + const value_idx *out_index, + const value_idx *indices, + const cub::KeyValuePair *kvp, + size_t nnz, cudaStream_t stream) { /** * Arrays should be ordered by: colors_indptr->colors_n->kvp.value * so the last element of each column in the input CSR should be * the min. */ - min_components_by_color_kernel<<>>( - coo.rows(), coo.cols(), coo.vals(), out_index, indices, kvp, nnz); + min_components_by_color_kernel<<>>(coo.rows(), coo.cols(), coo.vals(), + out_index, indices, kvp, nnz); } /** @@ -343,18 +318,14 @@ void min_components_by_color(raft::sparse::COO& coo, * @param[in] n_cols number of cols in X */ template -void connect_components( - const raft::handle_t& handle, - raft::sparse::COO& out, - const value_t* X, - const value_idx* orig_colors, - size_t n_rows, - size_t n_cols, - red_op reduction_op, - raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) -{ +void connect_components(const raft::handle_t &handle, + raft::sparse::COO &out, + const value_t *X, const value_idx *orig_colors, + size_t n_rows, size_t n_cols, red_op reduction_op, + raft::distance::DistanceType metric = + raft::distance::DistanceType::L2SqrtExpanded) { auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded, "Fixing connectivities for an unconnected k-NN graph only " @@ -364,52 +335,47 @@ void connect_components( raft::copy_async(colors.data(), orig_colors, n_rows, stream); // Normalize colors so they are drawn from a monotonically increasing set - raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, d_alloc, true); + raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, + d_alloc, true); - value_idx n_components = get_n_components(colors.data(), n_rows, d_alloc, stream); + value_idx n_components = + get_n_components(colors.data(), n_rows, d_alloc, stream); /** * First compute 1-nn for all colors where the color of each data point * is guaranteed to be != color of its nearest neighbor. */ rmm::device_uvector nn_colors(n_rows, stream); - rmm::device_uvector> temp_inds_dists(n_rows, stream); + rmm::device_uvector> temp_inds_dists( + n_rows, stream); rmm::device_uvector src_indices(n_rows, stream); - perform_1nn(temp_inds_dists.data(), - nn_colors.data(), - colors.data(), - X, - n_rows, - n_cols, - d_alloc, - stream, - reduction_op); + perform_1nn(temp_inds_dists.data(), nn_colors.data(), colors.data(), X, + n_rows, n_cols, d_alloc, stream, reduction_op); /** * Sort data points by color (neighbors are not sorted) */ // max_color + 1 = number of connected components // sort nn_colors by key w/ original colors - sort_by_color( - colors.data(), nn_colors.data(), temp_inds_dists.data(), src_indices.data(), n_rows, stream); + sort_by_color(colors.data(), nn_colors.data(), temp_inds_dists.data(), + src_indices.data(), n_rows, stream); /** * Take the min for any duplicate colors */ // Compute mask of duplicates rmm::device_uvector out_index(n_rows + 1, stream); - raft::sparse::op::compute_duplicates_mask( - out_index.data(), colors.data(), nn_colors.data(), n_rows, stream); + raft::sparse::op::compute_duplicates_mask(out_index.data(), colors.data(), + nn_colors.data(), n_rows, stream); - thrust::exclusive_scan(thrust::cuda::par.on(stream), - out_index.data(), - out_index.data() + out_index.size(), - out_index.data()); + thrust::exclusive_scan(thrust::cuda::par.on(stream), out_index.data(), + out_index.data() + out_index.size(), out_index.data()); // compute final size value_idx size = 0; - raft::update_host(&size, out_index.data() + (out_index.size() - 1), 1, stream); + raft::update_host(&size, out_index.data() + (out_index.size() - 1), 1, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); size++; @@ -417,14 +383,14 @@ void connect_components( raft::sparse::COO min_edges(d_alloc, stream); min_edges.allocate(size, n_rows, n_rows, true, stream); - min_components_by_color( - min_edges, out_index.data(), src_indices.data(), temp_inds_dists.data(), n_rows, stream); + min_components_by_color(min_edges, out_index.data(), src_indices.data(), + temp_inds_dists.data(), n_rows, stream); /** * Symmetrize resulting edge list */ - raft::sparse::linalg::symmetrize( - handle, min_edges.rows(), min_edges.cols(), min_edges.vals(), n_rows, n_rows, size, out); + raft::sparse::linalg::symmetrize(handle, min_edges.rows(), min_edges.cols(), + min_edges.vals(), n_rows, n_rows, size, out); } }; // end namespace linkage diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh index dbb24ee334..71fbb8ab3d 100644 --- a/cpp/include/raft/sparse/selection/knn.cuh +++ b/cpp/include/raft/sparse/selection/knn.cuh @@ -49,11 +49,9 @@ namespace selection { template struct csr_batcher_t { - csr_batcher_t(value_idx batch_size, - value_idx n_rows, - const value_idx* csr_indptr, - const value_idx* csr_indices, - const value_t* csr_data) + csr_batcher_t(value_idx batch_size, value_idx n_rows, + const value_idx *csr_indptr, const value_idx *csr_indices, + const value_t *csr_data) : batch_start_(0), batch_stop_(0), batch_rows_(0), @@ -63,42 +61,32 @@ struct csr_batcher_t { csr_indices_(csr_indices), csr_data_(csr_data), batch_csr_start_offset_(0), - batch_csr_stop_offset_(0) - { - } + batch_csr_stop_offset_(0) {} - void set_batch(int batch_num) - { + void set_batch(int batch_num) { batch_start_ = batch_num * batch_size_; - batch_stop_ = batch_start_ + batch_size_ - 1; // zero-based indexing + batch_stop_ = batch_start_ + batch_size_ - 1; // zero-based indexing - if (batch_stop_ >= total_rows_) batch_stop_ = total_rows_ - 1; // zero-based indexing + if (batch_stop_ >= total_rows_) + batch_stop_ = total_rows_ - 1; // zero-based indexing batch_rows_ = (batch_stop_ - batch_start_) + 1; } - value_idx get_batch_csr_indptr_nnz(value_idx* batch_indptr, cudaStream_t stream) - { - raft::sparse::op::csr_row_slice_indptr(batch_start_, - batch_stop_, - csr_indptr_, - batch_indptr, - &batch_csr_start_offset_, - &batch_csr_stop_offset_, - stream); + value_idx get_batch_csr_indptr_nnz(value_idx *batch_indptr, + cudaStream_t stream) { + raft::sparse::op::csr_row_slice_indptr( + batch_start_, batch_stop_, csr_indptr_, batch_indptr, + &batch_csr_start_offset_, &batch_csr_stop_offset_, stream); return batch_csr_stop_offset_ - batch_csr_start_offset_; } - void get_batch_csr_indices_data(value_idx* csr_indices, value_t* csr_data, cudaStream_t stream) - { - raft::sparse::op::csr_row_slice_populate(batch_csr_start_offset_, - batch_csr_stop_offset_, - csr_indices_, - csr_data_, - csr_indices, - csr_data, - stream); + void get_batch_csr_indices_data(value_idx *csr_indices, value_t *csr_data, + cudaStream_t stream) { + raft::sparse::op::csr_row_slice_populate( + batch_csr_start_offset_, batch_csr_stop_offset_, csr_indices_, csr_data_, + csr_indices, csr_data, stream); } value_idx batch_rows() const { return batch_rows_; } @@ -115,9 +103,9 @@ struct csr_batcher_t { value_idx total_rows_; - const value_idx* csr_indptr_; - const value_idx* csr_indices_; - const value_t* csr_data_; + const value_idx *csr_indptr_; + const value_idx *csr_indices_; + const value_t *csr_data_; value_idx batch_csr_start_offset_; value_idx batch_csr_stop_offset_; @@ -126,26 +114,18 @@ struct csr_batcher_t { template class sparse_knn_t { public: - sparse_knn_t(const value_idx* idxIndptr_, - const value_idx* idxIndices_, - const value_t* idxData_, - size_t idxNNZ_, - int n_idx_rows_, - int n_idx_cols_, - const value_idx* queryIndptr_, - const value_idx* queryIndices_, - const value_t* queryData_, - size_t queryNNZ_, - int n_query_rows_, - int n_query_cols_, - value_idx* output_indices_, - value_t* output_dists_, - int k_, - const raft::handle_t& handle_, - size_t batch_size_index_ = 2 << 14, // approx 1M - size_t batch_size_query_ = 2 << 14, - raft::distance::DistanceType metric_ = raft::distance::DistanceType::L2Expanded, - float metricArg_ = 0) + sparse_knn_t(const value_idx *idxIndptr_, const value_idx *idxIndices_, + const value_t *idxData_, size_t idxNNZ_, int n_idx_rows_, + int n_idx_cols_, const value_idx *queryIndptr_, + const value_idx *queryIndices_, const value_t *queryData_, + size_t queryNNZ_, int n_query_rows_, int n_query_cols_, + value_idx *output_indices_, value_t *output_dists_, int k_, + const raft::handle_t &handle_, + size_t batch_size_index_ = 2 << 14, // approx 1M + size_t batch_size_query_ = 2 << 14, + raft::distance::DistanceType metric_ = + raft::distance::DistanceType::L2Expanded, + float metricArg_ = 0) : idxIndptr(idxIndptr_), idxIndices(idxIndices_), idxData(idxData_), @@ -165,12 +145,9 @@ class sparse_knn_t { batch_size_index(batch_size_index_), batch_size_query(batch_size_query_), metric(metric_), - metricArg(metricArg_) - { - } + metricArg(metricArg_) {} - void run() - { + void run() { using namespace raft::sparse; int n_batches_query = raft::ceildiv((size_t)n_query_rows, batch_size_query); @@ -181,33 +158,37 @@ class sparse_knn_t { for (int i = 0; i < n_batches_query; i++) { /** - * Compute index batch info - */ + * Compute index batch info + */ query_batcher.set_batch(i); /** - * Slice CSR to rows in batch - */ + * Slice CSR to rows in batch + */ - rmm::device_uvector query_batch_indptr(query_batcher.batch_rows() + 1, - handle.get_stream()); + rmm::device_uvector query_batch_indptr( + query_batcher.batch_rows() + 1, handle.get_stream()); - value_idx n_query_batch_nnz = - query_batcher.get_batch_csr_indptr_nnz(query_batch_indptr.data(), handle.get_stream()); + value_idx n_query_batch_nnz = query_batcher.get_batch_csr_indptr_nnz( + query_batch_indptr.data(), handle.get_stream()); - rmm::device_uvector query_batch_indices(n_query_batch_nnz, handle.get_stream()); - rmm::device_uvector query_batch_data(n_query_batch_nnz, handle.get_stream()); + rmm::device_uvector query_batch_indices(n_query_batch_nnz, + handle.get_stream()); + rmm::device_uvector query_batch_data(n_query_batch_nnz, + handle.get_stream()); - query_batcher.get_batch_csr_indices_data( - query_batch_indices.data(), query_batch_data.data(), handle.get_stream()); + query_batcher.get_batch_csr_indices_data(query_batch_indices.data(), + query_batch_data.data(), + handle.get_stream()); // A 3-partition temporary merge space to scale the batching. 2 parts for subsequent // batches and 1 space for the results of the merge, which get copied back to the top - rmm::device_uvector merge_buffer_indices(0, handle.get_stream()); + rmm::device_uvector merge_buffer_indices(0, + handle.get_stream()); rmm::device_uvector merge_buffer_dists(0, handle.get_stream()); - value_t* dists_merge_buffer_ptr; - value_idx* indices_merge_buffer_ptr; + value_t *dists_merge_buffer_ptr; + value_idx *indices_merge_buffer_ptr; int n_batches_idx = raft::ceildiv((size_t)n_idx_rows, batch_size_index); csr_batcher_t idx_batcher( @@ -216,19 +197,22 @@ class sparse_knn_t { for (int j = 0; j < n_batches_idx; j++) { idx_batcher.set_batch(j); - merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, handle.get_stream()); - merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, handle.get_stream()); + merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, + handle.get_stream()); + merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, + handle.get_stream()); /** - * Slice CSR to rows in batch - */ - rmm::device_uvector idx_batch_indptr(idx_batcher.batch_rows() + 1, - handle.get_stream()); - rmm::device_uvector idx_batch_indices(0, handle.get_stream()); + * Slice CSR to rows in batch + */ + rmm::device_uvector idx_batch_indptr( + idx_batcher.batch_rows() + 1, handle.get_stream()); + rmm::device_uvector idx_batch_indices(0, + handle.get_stream()); rmm::device_uvector idx_batch_data(0, handle.get_stream()); - value_idx idx_batch_nnz = - idx_batcher.get_batch_csr_indptr_nnz(idx_batch_indptr.data(), handle.get_stream()); + value_idx idx_batch_nnz = idx_batcher.get_batch_csr_indptr_nnz( + idx_batch_indptr.data(), handle.get_stream()); idx_batch_indices.resize(idx_batch_nnz, handle.get_stream()); idx_batch_data.resize(idx_batch_nnz, handle.get_stream()); @@ -237,126 +221,111 @@ class sparse_knn_t { idx_batch_indices.data(), idx_batch_data.data(), handle.get_stream()); /** - * Compute distances - */ - size_t dense_size = idx_batcher.batch_rows() * query_batcher.batch_rows(); - rmm::device_uvector batch_dists(dense_size, handle.get_stream()); - - CUDA_CHECK(cudaMemset(batch_dists.data(), 0, batch_dists.size() * sizeof(value_t))); - - compute_distances(idx_batcher, - query_batcher, - idx_batch_nnz, - n_query_batch_nnz, - idx_batch_indptr.data(), - idx_batch_indices.data(), - idx_batch_data.data(), - query_batch_indptr.data(), - query_batch_indices.data(), - query_batch_data.data(), - batch_dists.data()); + * Compute distances + */ + size_t dense_size = + idx_batcher.batch_rows() * query_batcher.batch_rows(); + rmm::device_uvector batch_dists(dense_size, + handle.get_stream()); + + CUDA_CHECK(cudaMemset(batch_dists.data(), 0, + batch_dists.size() * sizeof(value_t))); + + compute_distances(idx_batcher, query_batcher, idx_batch_nnz, + n_query_batch_nnz, idx_batch_indptr.data(), + idx_batch_indices.data(), idx_batch_data.data(), + query_batch_indptr.data(), query_batch_indices.data(), + query_batch_data.data(), batch_dists.data()); // Build batch indices array - rmm::device_uvector batch_indices(batch_dists.size(), handle.get_stream()); + rmm::device_uvector batch_indices(batch_dists.size(), + handle.get_stream()); // populate batch indices array - value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows(); + value_idx batch_rows = query_batcher.batch_rows(), + batch_cols = idx_batcher.batch_rows(); - iota_fill(batch_indices.data(), batch_rows, batch_cols, handle.get_stream()); + iota_fill(batch_indices.data(), batch_rows, batch_cols, + handle.get_stream()); /** * Perform k-selection on batch & merge with other k-selections */ size_t merge_buffer_offset = batch_rows * k; - dists_merge_buffer_ptr = merge_buffer_dists.data() + merge_buffer_offset; - indices_merge_buffer_ptr = merge_buffer_indices.data() + merge_buffer_offset; - - perform_k_selection(idx_batcher, - query_batcher, - batch_dists.data(), - batch_indices.data(), - dists_merge_buffer_ptr, + dists_merge_buffer_ptr = + merge_buffer_dists.data() + merge_buffer_offset; + indices_merge_buffer_ptr = + merge_buffer_indices.data() + merge_buffer_offset; + + perform_k_selection(idx_batcher, query_batcher, batch_dists.data(), + batch_indices.data(), dists_merge_buffer_ptr, indices_merge_buffer_ptr); - value_t* dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr; - value_idx* indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr; + value_t *dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr; + value_idx *indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr; // Merge results of difference batches if necessary if (idx_batcher.batch_start() > 0) { - size_t merge_buffer_tmp_out = batch_rows * k * 2; - dists_merge_buffer_tmp_ptr = merge_buffer_dists.data() + merge_buffer_tmp_out; - indices_merge_buffer_tmp_ptr = merge_buffer_indices.data() + merge_buffer_tmp_out; - - merge_batches(idx_batcher, - query_batcher, - merge_buffer_dists.data(), - merge_buffer_indices.data(), - dists_merge_buffer_tmp_ptr, + size_t merge_buffer_tmp_out = batch_rows * k * 2; + dists_merge_buffer_tmp_ptr = + merge_buffer_dists.data() + merge_buffer_tmp_out; + indices_merge_buffer_tmp_ptr = + merge_buffer_indices.data() + merge_buffer_tmp_out; + + merge_batches(idx_batcher, query_batcher, merge_buffer_dists.data(), + merge_buffer_indices.data(), dists_merge_buffer_tmp_ptr, indices_merge_buffer_tmp_ptr); } // copy merged output back into merge buffer partition for next iteration raft::copy_async(merge_buffer_indices.data(), indices_merge_buffer_tmp_ptr, - batch_rows * k, - handle.get_stream()); + batch_rows * k, handle.get_stream()); raft::copy_async(merge_buffer_dists.data(), - dists_merge_buffer_tmp_ptr, - batch_rows * k, + dists_merge_buffer_tmp_ptr, batch_rows * k, handle.get_stream()); } // Copy final merged batch to output array - raft::copy_async(output_indices + (rows_processed * k), - merge_buffer_indices.data(), - query_batcher.batch_rows() * k, - handle.get_stream()); - raft::copy_async(output_dists + (rows_processed * k), - merge_buffer_dists.data(), - query_batcher.batch_rows() * k, - handle.get_stream()); + raft::copy_async( + output_indices + (rows_processed * k), merge_buffer_indices.data(), + query_batcher.batch_rows() * k, handle.get_stream()); + raft::copy_async( + output_dists + (rows_processed * k), merge_buffer_dists.data(), + query_batcher.batch_rows() * k, handle.get_stream()); rows_processed += query_batcher.batch_rows(); } } private: - void merge_batches(csr_batcher_t& idx_batcher, - csr_batcher_t& query_batcher, - value_t* merge_buffer_dists, - value_idx* merge_buffer_indices, - value_t* out_dists, - value_idx* out_indices) - { + void merge_batches(csr_batcher_t &idx_batcher, + csr_batcher_t &query_batcher, + value_t *merge_buffer_dists, + value_idx *merge_buffer_indices, value_t *out_dists, + value_idx *out_indices) { // build translation buffer to shift resulting indices by the batch std::vector id_ranges; id_ranges.push_back(0); id_ranges.push_back(idx_batcher.batch_start()); rmm::device_uvector trans(id_ranges.size(), handle.get_stream()); - raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(), handle.get_stream()); + raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(), + handle.get_stream()); // combine merge buffers only if there's more than 1 partition to combine - raft::spatial::knn::knn_merge_parts(merge_buffer_dists, - merge_buffer_indices, - out_dists, - out_indices, - query_batcher.batch_rows(), - 2, - k, - handle.get_stream(), - trans.data()); + raft::spatial::knn::knn_merge_parts( + merge_buffer_dists, merge_buffer_indices, out_dists, out_indices, + query_batcher.batch_rows(), 2, k, handle.get_stream(), trans.data()); } void perform_k_selection(csr_batcher_t idx_batcher, csr_batcher_t query_batcher, - value_t* batch_dists, - value_idx* batch_indices, - value_t* out_dists, - value_idx* out_indices) - { + value_t *batch_dists, value_idx *batch_indices, + value_t *out_dists, value_idx *out_indices) { // populate batch indices array - value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows(); + value_idx batch_rows = query_batcher.batch_rows(), + batch_cols = idx_batcher.batch_rows(); // build translation buffer to shift resulting indices by the batch std::vector id_ranges; @@ -371,60 +340,51 @@ class sparse_knn_t { if (metric == raft::distance::DistanceType::InnerProduct) ascending = false; // kernel to slice first (min) k cols and copy into batched merge buffer - select_k(batch_dists, - batch_indices, - batch_rows, - batch_cols, - out_dists, - out_indices, - ascending, - n_neighbors, - handle.get_stream()); + select_k(batch_dists, batch_indices, batch_rows, batch_cols, out_dists, + out_indices, ascending, n_neighbors, handle.get_stream()); } - void compute_distances(csr_batcher_t& idx_batcher, - csr_batcher_t& query_batcher, - size_t idx_batch_nnz, - size_t query_batch_nnz, - value_idx* idx_batch_indptr, - value_idx* idx_batch_indices, - value_t* idx_batch_data, - value_idx* query_batch_indptr, - value_idx* query_batch_indices, - value_t* query_batch_data, - value_t* batch_dists) - { + void compute_distances(csr_batcher_t &idx_batcher, + csr_batcher_t &query_batcher, + size_t idx_batch_nnz, size_t query_batch_nnz, + value_idx *idx_batch_indptr, + value_idx *idx_batch_indices, value_t *idx_batch_data, + value_idx *query_batch_indptr, + value_idx *query_batch_indices, + value_t *query_batch_data, value_t *batch_dists) { /** * Compute distances */ - raft::sparse::distance::distances_config_t dist_config(handle); + raft::sparse::distance::distances_config_t dist_config( + handle); dist_config.b_nrows = idx_batcher.batch_rows(); dist_config.b_ncols = n_idx_cols; - dist_config.b_nnz = idx_batch_nnz; + dist_config.b_nnz = idx_batch_nnz; - dist_config.b_indptr = idx_batch_indptr; + dist_config.b_indptr = idx_batch_indptr; dist_config.b_indices = idx_batch_indices; - dist_config.b_data = idx_batch_data; + dist_config.b_data = idx_batch_data; dist_config.a_nrows = query_batcher.batch_rows(); dist_config.a_ncols = n_query_cols; - dist_config.a_nnz = query_batch_nnz; + dist_config.a_nnz = query_batch_nnz; - dist_config.a_indptr = query_batch_indptr; + dist_config.a_indptr = query_batch_indptr; dist_config.a_indices = query_batch_indices; - dist_config.a_data = query_batch_data; + dist_config.a_data = query_batch_data; if (raft::sparse::distance::supportedDistance.find(metric) == raft::sparse::distance::supportedDistance.end()) THROW("DistanceType not supported: %d", metric); - raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric, metricArg); + raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric, + metricArg); } const value_idx *idxIndptr, *idxIndices, *queryIndptr, *queryIndices; - value_idx* output_indices; + value_idx *output_indices; const value_t *idxData, *queryData; - value_t* output_dists; + value_t *output_dists; size_t idxNNZ, queryNNZ, batch_size_index, batch_size_query; @@ -434,76 +394,52 @@ class sparse_knn_t { int n_idx_rows, n_idx_cols, n_query_rows, n_query_cols, k; - const raft::handle_t& handle; + const raft::handle_t &handle; }; /** - * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors - * using some distance implementation - * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1) - * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz) - * @param[in] idxData csr data array of the index matrix (size idxNNZ) - * @param[in] idxNNA number of non-zeros for sparse index matrix - * @param[in] n_idx_rows number of data samples in index matrix - * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1) - * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ) - * @param[in] queryData csr data array of the query matrix (size queryNNZ) - * @param[in] queryNNZ number of non-zeros for sparse query matrix - * @param[in] n_query_rows number of data samples in query matrix - * @param[in] n_query_cols number of features in query matrix - * @param[out] output_indices dense matrix for output indices (size n_query_rows * k) - * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) - * @param[in] k the number of neighbors to query - * @param[in] cusparseHandle the initialized cusparseHandle instance to use - * @param[in] allocator device allocator instance to use - * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to - * @param[in] batch_size_index maximum number of rows to use from index matrix per batch - * @param[in] batch_size_query maximum number of rows to use from query matrix per batch - * @param[in] metric distance metric/measure to use - * @param[in] metricArg potential argument for metric (currently unused) - */ + * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors + * using some distance implementation + * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1) + * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz) + * @param[in] idxData csr data array of the index matrix (size idxNNZ) + * @param[in] idxNNA number of non-zeros for sparse index matrix + * @param[in] n_idx_rows number of data samples in index matrix + * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1) + * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ) + * @param[in] queryData csr data array of the query matrix (size queryNNZ) + * @param[in] queryNNZ number of non-zeros for sparse query matrix + * @param[in] n_query_rows number of data samples in query matrix + * @param[in] n_query_cols number of features in query matrix + * @param[out] output_indices dense matrix for output indices (size n_query_rows * k) + * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) + * @param[in] k the number of neighbors to query + * @param[in] cusparseHandle the initialized cusparseHandle instance to use + * @param[in] allocator device allocator instance to use + * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to + * @param[in] batch_size_index maximum number of rows to use from index matrix per batch + * @param[in] batch_size_query maximum number of rows to use from query matrix per batch + * @param[in] metric distance metric/measure to use + * @param[in] metricArg potential argument for metric (currently unused) + */ template -void brute_force_knn(const value_idx* idxIndptr, - const value_idx* idxIndices, - const value_t* idxData, - size_t idxNNZ, - int n_idx_rows, - int n_idx_cols, - const value_idx* queryIndptr, - const value_idx* queryIndices, - const value_t* queryData, - size_t queryNNZ, - int n_query_rows, - int n_query_cols, - value_idx* output_indices, - value_t* output_dists, - int k, - const raft::handle_t& handle, - size_t batch_size_index = 2 << 14, // approx 1M - size_t batch_size_query = 2 << 14, - raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded, - float metricArg = 0) -{ - sparse_knn_t(idxIndptr, - idxIndices, - idxData, - idxNNZ, - n_idx_rows, - n_idx_cols, - queryIndptr, - queryIndices, - queryData, - queryNNZ, - n_query_rows, - n_query_cols, - output_indices, - output_dists, - k, - handle, - batch_size_index, - batch_size_query, - metric, - metricArg) +void brute_force_knn(const value_idx *idxIndptr, const value_idx *idxIndices, + const value_t *idxData, size_t idxNNZ, int n_idx_rows, + int n_idx_cols, const value_idx *queryIndptr, + const value_idx *queryIndices, const value_t *queryData, + size_t queryNNZ, int n_query_rows, int n_query_cols, + value_idx *output_indices, value_t *output_dists, int k, + const raft::handle_t &handle, + size_t batch_size_index = 2 << 14, // approx 1M + size_t batch_size_query = 2 << 14, + raft::distance::DistanceType metric = + raft::distance::DistanceType::L2Expanded, + float metricArg = 0) { + sparse_knn_t( + idxIndptr, idxIndices, idxData, idxNNZ, n_idx_rows, n_idx_cols, queryIndptr, + queryIndices, queryData, queryNNZ, n_query_rows, n_query_cols, + output_indices, output_dists, k, handle, batch_size_index, batch_size_query, + metric, metricArg) .run(); } diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh index 1308f5ce02..1cf225087a 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.cuh +++ b/cpp/include/raft/sparse/selection/knn_graph.cuh @@ -45,34 +45,31 @@ namespace selection { * @param m */ template -__global__ void fill_indices(value_idx* indices, size_t m, size_t nnz) -{ +__global__ void fill_indices(value_idx *indices, size_t m, size_t nnz) { value_idx tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid >= nnz) return; - value_idx v = tid / m; + value_idx v = tid / m; indices[tid] = v; } template -value_idx build_k(value_idx n_samples, int c) -{ +value_idx build_k(value_idx n_samples, int c) { // from "kNN-MST-Agglomerative: A fast & scalable graph-based data clustering // approach on GPU" - return min(n_samples, max((value_idx)2, (value_idx)floor(log2(n_samples)) + c)); + return min(n_samples, + max((value_idx)2, (value_idx)floor(log2(n_samples)) + c)); } template -__global__ void conv_indices_kernel(in_t* inds, out_t* out, size_t nnz) -{ +__global__ void conv_indices_kernel(in_t *inds, out_t *out, size_t nnz) { size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; - out_t v = inds[tid]; + out_t v = inds[tid]; out[tid] = v; } template -void conv_indices(in_t* inds, out_t* out, size_t size, cudaStream_t stream) -{ +void conv_indices(in_t *inds, out_t *out, size_t size, cudaStream_t stream) { size_t blocks = ceildiv(size, (size_t)tpb); conv_indices_kernel<<>>(inds, out, size); } @@ -94,18 +91,13 @@ void conv_indices(in_t* inds, out_t* out, size_t size, cudaStream_t stream) * @param c */ template -void knn_graph(const handle_t& handle, - const value_t* X, - size_t m, - size_t n, +void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, distance::DistanceType metric, - raft::sparse::COO& out, - int c = 15) -{ + raft::sparse::COO &out, int c = 15) { int k = build_k(m, c); auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); size_t nnz = m * k; @@ -116,8 +108,8 @@ void knn_graph(const handle_t& handle, size_t blocks = ceildiv(nnz, (size_t)256); fill_indices<<>>(rows.data(), k, nnz); - std::vector inputs; - inputs.push_back(const_cast(X)); + std::vector inputs; + inputs.push_back(const_cast(X)); std::vector sizes; sizes.push_back(m); @@ -127,25 +119,15 @@ void knn_graph(const handle_t& handle, rmm::device_uvector int64_indices(nnz, stream); uint32_t knn_start = curTimeMillis(); - raft::spatial::knn::brute_force_knn(handle, - inputs, - sizes, - n, - const_cast(X), - m, - int64_indices.data(), - data.data(), - k, - true, - true, - nullptr, - metric); + raft::spatial::knn::brute_force_knn( + handle, inputs, sizes, n, const_cast(X), m, int64_indices.data(), + data.data(), k, true, true, nullptr, metric); // convert from current knn's 64-bit to 32-bit. conv_indices(int64_indices.data(), indices.data(), nnz, stream); - raft::sparse::linalg::symmetrize( - handle, rows.data(), indices.data(), data.data(), m, k, nnz, out); + raft::sparse::linalg::symmetrize(handle, rows.data(), indices.data(), + data.data(), m, k, nnz, out); } }; // namespace selection diff --git a/cpp/include/raft/sparse/selection/selection.cuh b/cpp/include/raft/sparse/selection/selection.cuh index 190e06b2cd..6066a36289 100644 --- a/cpp/include/raft/sparse/selection/selection.cuh +++ b/cpp/include/raft/sparse/selection/selection.cuh @@ -39,33 +39,27 @@ namespace raft { namespace sparse { namespace selection { -template -__global__ void select_k_kernel(K* inK, - IndexType* inV, - size_t n_rows, - size_t n_cols, - K* outK, - IndexType* outV, - K initK, - IndexType initV, - int k) -{ +template +__global__ void select_k_kernel(K *inK, IndexType *inV, size_t n_rows, + size_t n_cols, K *outK, IndexType *outV, + K initK, IndexType initV, int k) { constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ K smemK[kNumWarps * warp_q]; __shared__ IndexType smemV[kNumWarps * warp_q]; - faiss::gpu:: - BlockSelect, warp_q, thread_q, tpb> - heap(initK, initV, smemK, smemV, k); + faiss::gpu::BlockSelect, + warp_q, thread_q, tpb> + heap(initK, initV, smemK, smemV, k); // Grid is exactly sized to rows available int row = blockIdx.x; - int i = threadIdx.x; + int i = threadIdx.x; - int idx = row * n_cols; - K* inKStart = inK + idx + i; - IndexType* inVStart = inV + idx + i; + int idx = row * n_cols; + K *inKStart = inK + idx + i; + IndexType *inVStart = inV + idx + i; // Whole warps must participate in the selection int limit = faiss::gpu::utils::roundDown(n_cols, faiss::gpu::kWarpSize); @@ -92,31 +86,27 @@ __global__ void select_k_kernel(K* inK, } } -template -inline void select_k_impl(value_t* inK, - value_idx* inV, - size_t n_rows, - size_t n_cols, - value_t* outK, - value_idx* outV, - bool select_min, - int k, - cudaStream_t stream) -{ +template +inline void select_k_impl(value_t *inK, value_idx *inV, size_t n_rows, + size_t n_cols, value_t *outK, value_idx *outV, + bool select_min, int k, cudaStream_t stream) { auto grid = dim3(n_rows); constexpr int n_threads = (warp_q <= 1024) ? 128 : 64; - auto block = dim3(n_threads); + auto block = dim3(n_threads); - auto kInit = - select_min ? faiss::gpu::Limits::getMax() : faiss::gpu::Limits::getMin(); + auto kInit = select_min ? faiss::gpu::Limits::getMax() + : faiss::gpu::Limits::getMin(); auto vInit = -1; if (select_min) { select_k_kernel - <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k); + <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, + vInit, k); } else { select_k_kernel - <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k); + <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, + vInit, k); } CUDA_CHECK(cudaGetLastError()); } @@ -136,37 +126,30 @@ inline void select_k_impl(value_t* inK, * @param[in] stream CUDA stream to use */ template -inline void select_k(value_t* inK, - value_idx* inV, - size_t n_rows, - size_t n_cols, - value_t* outK, - value_idx* outV, - bool select_min, - int k, - cudaStream_t stream) -{ +inline void select_k(value_t *inK, value_idx *inV, size_t n_rows, size_t n_cols, + value_t *outK, value_idx *outV, bool select_min, int k, + cudaStream_t stream) { if (k == 1) - select_k_impl( - inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); + select_k_impl(inK, inV, n_rows, n_cols, outK, + outV, select_min, k, stream); else if (k <= 32) - select_k_impl( - inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); + select_k_impl(inK, inV, n_rows, n_cols, outK, + outV, select_min, k, stream); else if (k <= 64) - select_k_impl( - inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); + select_k_impl(inK, inV, n_rows, n_cols, outK, + outV, select_min, k, stream); else if (k <= 128) - select_k_impl( - inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); + select_k_impl(inK, inV, n_rows, n_cols, outK, + outV, select_min, k, stream); else if (k <= 256) - select_k_impl( - inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); + select_k_impl(inK, inV, n_rows, n_cols, outK, + outV, select_min, k, stream); else if (k <= 512) - select_k_impl( - inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); + select_k_impl(inK, inV, n_rows, n_cols, outK, + outV, select_min, k, stream); else if (k <= 1024) - select_k_impl( - inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); + select_k_impl(inK, inV, n_rows, n_cols, outK, + outV, select_min, k, stream); } }; // namespace selection diff --git a/cpp/include/raft/sparse/utils.h b/cpp/include/raft/sparse/utils.h index 56e8832e0a..63578bf1f3 100644 --- a/cpp/include/raft/sparse/utils.h +++ b/cpp/include/raft/sparse/utils.h @@ -26,8 +26,7 @@ namespace sparse { * @param[in] ncols number of blocks to quantize */ template -inline int block_dim(value_idx ncols) -{ +inline int block_dim(value_idx ncols) { int blockdim; if (ncols <= 32) blockdim = 32; @@ -55,9 +54,9 @@ inline int block_dim(value_idx ncols) * @return */ template -__device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, G key) -{ - unsigned int mask = __ballot_sync(init_mask, true); +__device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, + G key) { + unsigned int mask = __ballot_sync(init_mask, true); unsigned int peer_group = 0; bool is_peer; @@ -78,14 +77,12 @@ __device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, G ke } #endif -__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) -{ +__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) { return __ffs(peer_group) - 1; } template -__global__ void iota_fill_block_kernel(value_idx* indices, value_idx ncols) -{ +__global__ void iota_fill_block_kernel(value_idx *indices, value_idx ncols) { int row = blockIdx.x; int tid = threadIdx.x; @@ -95,16 +92,15 @@ __global__ void iota_fill_block_kernel(value_idx* indices, value_idx ncols) } template -void iota_fill(value_idx* indices, value_idx nrows, value_idx ncols, cudaStream_t stream) -{ +void iota_fill(value_idx *indices, value_idx nrows, value_idx ncols, + cudaStream_t stream) { int blockdim = block_dim(ncols); iota_fill_block_kernel<<>>(indices, ncols); } template -__device__ int get_stop_idx(T row, T m, T nnz, const T* ind) -{ +__device__ int get_stop_idx(T row, T m, T nnz, const T *ind) { int stop_idx = 0; if (row < (m - 1)) stop_idx = ind[row + 1]; diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp index f77a56164d..77d7831b4a 100644 --- a/cpp/include/raft/spatial/knn/ann.hpp +++ b/cpp/include/raft/spatial/knn/ann.hpp @@ -45,16 +45,14 @@ using deviceAllocator = raft::mr::device::allocator; * @param[in] D the dimensionality of the index array */ template -inline void approx_knn_build_index(raft::handle_t& handle, - raft::spatial::knn::knnIndex* index, - knnIndexParam* params, +inline void approx_knn_build_index(raft::handle_t &handle, + raft::spatial::knn::knnIndex *index, + knnIndexParam *params, raft::distance::DistanceType metric, - float metricArg, - float* index_array, - value_idx n, - value_idx D) -{ - detail::approx_knn_build_index(handle, index, params, metric, metricArg, index_array, n, D); + float metricArg, float *index_array, + value_idx n, value_idx D) { + detail::approx_knn_build_index(handle, index, params, metric, metricArg, + index_array, n, D); } /** @@ -71,15 +69,12 @@ inline void approx_knn_build_index(raft::handle_t& handle, * @param[in] n number of rows in the query array */ template -inline void approx_knn_search(raft::handle_t& handle, - float* distances, - int64_t* indices, - raft::spatial::knn::knnIndex* index, - value_idx k, - float* query_array, - value_idx n) -{ - detail::approx_knn_search(handle, distances, indices, index, k, query_array, n); +inline void approx_knn_search(raft::handle_t &handle, float *distances, + int64_t *indices, + raft::spatial::knn::knnIndex *index, value_idx k, + float *query_array, value_idx n) { + detail::approx_knn_search(handle, distances, indices, index, k, query_array, + n); } } // namespace knn diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h index 573a23181d..6a6c7751c2 100644 --- a/cpp/include/raft/spatial/knn/ann_common.h +++ b/cpp/include/raft/spatial/knn/ann_common.h @@ -26,14 +26,13 @@ namespace spatial { namespace knn { struct knnIndex { - faiss::gpu::GpuIndex* index; + faiss::gpu::GpuIndex *index; raft::distance::DistanceType metric; float metricArg; - faiss::gpu::StandardGpuResources* gpu_res; + faiss::gpu::StandardGpuResources *gpu_res; int device; - ~knnIndex() - { + ~knnIndex() { delete index; delete gpu_res; } @@ -58,8 +57,7 @@ struct IVFParam : knnIndexParam { int nprobe; }; -struct IVFFlatParam : IVFParam { -}; +struct IVFFlatParam : IVFParam {}; struct IVFPQParam : IVFParam { int M; diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 7eb439c78b..6e4c99b646 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -56,107 +56,115 @@ namespace spatial { namespace knn { namespace detail { -inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(QuantizerType qtype) -{ +inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype( + QuantizerType qtype) { switch (qtype) { - case QuantizerType::QT_8bit: return faiss::ScalarQuantizer::QuantizerType::QT_8bit; + case QuantizerType::QT_8bit: + return faiss::ScalarQuantizer::QuantizerType::QT_8bit; case QuantizerType::QT_8bit_uniform: return faiss::ScalarQuantizer::QuantizerType::QT_8bit_uniform; case QuantizerType::QT_4bit_uniform: return faiss::ScalarQuantizer::QuantizerType::QT_4bit_uniform; - case QuantizerType::QT_fp16: return faiss::ScalarQuantizer::QuantizerType::QT_fp16; + case QuantizerType::QT_fp16: + return faiss::ScalarQuantizer::QuantizerType::QT_fp16; case QuantizerType::QT_8bit_direct: return faiss::ScalarQuantizer::QuantizerType::QT_8bit_direct; - case QuantizerType::QT_6bit: return faiss::ScalarQuantizer::QuantizerType::QT_6bit; - default: return (faiss::ScalarQuantizer::QuantizerType)qtype; + case QuantizerType::QT_6bit: + return faiss::ScalarQuantizer::QuantizerType::QT_6bit; + default: + return (faiss::ScalarQuantizer::QuantizerType)qtype; } } template -void approx_knn_ivfflat_build_index( - knnIndex* index, IVFParam* params, raft::distance::DistanceType metric, IntType n, IntType D) -{ +void approx_knn_ivfflat_build_index(knnIndex *index, IVFParam *params, + raft::distance::DistanceType metric, + IntType n, IntType D) { faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = index->device; + config.device = index->device; faiss::MetricType faiss_metric = build_faiss_metric(metric); - faiss::gpu::GpuIndexIVFFlat* faiss_index = - new faiss::gpu::GpuIndexIVFFlat(index->gpu_res, D, params->nlist, faiss_metric, config); + faiss::gpu::GpuIndexIVFFlat *faiss_index = new faiss::gpu::GpuIndexIVFFlat( + index->gpu_res, D, params->nlist, faiss_metric, config); faiss_index->setNumProbes(params->nprobe); index->index = faiss_index; } template -void approx_knn_ivfpq_build_index( - knnIndex* index, IVFPQParam* params, raft::distance::DistanceType metric, IntType n, IntType D) -{ +void approx_knn_ivfpq_build_index(knnIndex *index, IVFPQParam *params, + raft::distance::DistanceType metric, + IntType n, IntType D) { faiss::gpu::GpuIndexIVFPQConfig config; - config.device = index->device; - config.usePrecomputedTables = params->usePrecomputedTables; - config.interleavedLayout = params->n_bits != 8; - faiss::MetricType faiss_metric = build_faiss_metric(metric); - faiss::gpu::GpuIndexIVFPQ* faiss_index = new faiss::gpu::GpuIndexIVFPQ( - index->gpu_res, D, params->nlist, params->M, params->n_bits, faiss_metric, config); + config.device = index->device; + config.usePrecomputedTables = params->usePrecomputedTables; + config.interleavedLayout = params->n_bits != 8; + faiss::MetricType faiss_metric = build_faiss_metric(metric); + faiss::gpu::GpuIndexIVFPQ *faiss_index = + new faiss::gpu::GpuIndexIVFPQ(index->gpu_res, D, params->nlist, params->M, + params->n_bits, faiss_metric, config); faiss_index->setNumProbes(params->nprobe); index->index = faiss_index; } template -void approx_knn_ivfsq_build_index( - knnIndex* index, IVFSQParam* params, raft::distance::DistanceType metric, IntType n, IntType D) -{ +void approx_knn_ivfsq_build_index(knnIndex *index, IVFSQParam *params, + raft::distance::DistanceType metric, + IntType n, IntType D) { faiss::gpu::GpuIndexIVFScalarQuantizerConfig config; - config.device = index->device; - faiss::MetricType faiss_metric = build_faiss_metric(metric); - faiss::ScalarQuantizer::QuantizerType faiss_qtype = build_faiss_qtype(params->qtype); - faiss::gpu::GpuIndexIVFScalarQuantizer* faiss_index = new faiss::gpu::GpuIndexIVFScalarQuantizer( - index->gpu_res, D, params->nlist, faiss_qtype, faiss_metric, params->encodeResidual); + config.device = index->device; + faiss::MetricType faiss_metric = build_faiss_metric(metric); + faiss::ScalarQuantizer::QuantizerType faiss_qtype = + build_faiss_qtype(params->qtype); + faiss::gpu::GpuIndexIVFScalarQuantizer *faiss_index = + new faiss::gpu::GpuIndexIVFScalarQuantizer(index->gpu_res, D, params->nlist, + faiss_qtype, faiss_metric, + params->encodeResidual); faiss_index->setNumProbes(params->nprobe); index->index = faiss_index; } template -void approx_knn_build_index(raft::handle_t& handle, - raft::spatial::knn::knnIndex* index, - raft::spatial::knn::knnIndexParam* params, +void approx_knn_build_index(raft::handle_t &handle, + raft::spatial::knn::knnIndex *index, + raft::spatial::knn::knnIndexParam *params, raft::distance::DistanceType metric, - float metricArg, - float* index_array, - IntType n, - IntType D) -{ + float metricArg, float *index_array, IntType n, + IntType D) { int device; CUDA_CHECK(cudaGetDevice(&device)); - faiss::gpu::StandardGpuResources* gpu_res = new faiss::gpu::StandardGpuResources(); + faiss::gpu::StandardGpuResources *gpu_res = + new faiss::gpu::StandardGpuResources(); gpu_res->noTempMemory(); gpu_res->setDefaultStream(device, handle.get_stream()); - index->gpu_res = gpu_res; - index->device = device; - index->index = nullptr; - index->metric = metric; + index->gpu_res = gpu_res; + index->device = device; + index->index = nullptr; + index->metric = metric; index->metricArg = metricArg; // perform preprocessing // k set to 0 (unused during preprocessing / revertion) - std::unique_ptr> query_metric_processor = create_processor( - metric, n, D, 0, false, handle.get_stream(), handle.get_device_allocator()); + std::unique_ptr> query_metric_processor = + create_processor(metric, n, D, 0, false, handle.get_stream(), + handle.get_device_allocator()); query_metric_processor->preprocess(index_array); - if (dynamic_cast(params)) { - IVFFlatParam* IVFFlat_param = dynamic_cast(params); + if (dynamic_cast(params)) { + IVFFlatParam *IVFFlat_param = dynamic_cast(params); approx_knn_ivfflat_build_index(index, IVFFlat_param, metric, n, D); std::vector h_index_array(n * D); - raft::update_host(h_index_array.data(), index_array, h_index_array.size(), handle.get_stream()); + raft::update_host(h_index_array.data(), index_array, h_index_array.size(), + handle.get_stream()); query_metric_processor->revert(index_array); index->index->train(n, h_index_array.data()); index->index->add(n, h_index_array.data()); } else { - if (dynamic_cast(params)) { - IVFPQParam* IVFPQ_param = dynamic_cast(params); + if (dynamic_cast(params)) { + IVFPQParam *IVFPQ_param = dynamic_cast(params); approx_knn_ivfpq_build_index(index, IVFPQ_param, metric, n, D); - } else if (dynamic_cast(params)) { - IVFSQParam* IVFSQ_param = dynamic_cast(params); + } else if (dynamic_cast(params)) { + IVFSQParam *IVFSQ_param = dynamic_cast(params); approx_knn_ivfsq_build_index(index, IVFSQ_param, metric, n, D); } else { ASSERT(index->index, "KNN index could not be initialized"); @@ -169,23 +177,13 @@ void approx_knn_build_index(raft::handle_t& handle, } template -void approx_knn_search(raft::handle_t& handle, - float* distances, - int64_t* indices, - raft::spatial::knn::knnIndex* index, - IntType k, - float* query_array, - IntType n) -{ +void approx_knn_search(raft::handle_t &handle, float *distances, + int64_t *indices, raft::spatial::knn::knnIndex *index, + IntType k, float *query_array, IntType n) { // perform preprocessing std::unique_ptr> query_metric_processor = - create_processor(index->metric, - n, - index->index->d, - k, - false, - handle.get_stream(), - handle.get_device_allocator()); + create_processor(index->metric, n, index->index->d, k, false, + handle.get_stream(), handle.get_device_allocator()); query_metric_processor->preprocess(query_array); index->index->search(n, query_array, k, distances, indices); @@ -196,14 +194,13 @@ void approx_knn_search(raft::handle_t& handle, index->metric == raft::distance::DistanceType::L2SqrtUnexpanded || index->metric == raft::distance::DistanceType::LpUnexpanded) { /** - * post-processing - */ + * post-processing + */ float p = 0.5; // standard l2 - if (index->metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / index->metricArg; + if (index->metric == raft::distance::DistanceType::LpUnexpanded) + p = 1.0 / index->metricArg; raft::linalg::unaryOp( - distances, - distances, - n * k, + distances, distances, n * k, [p] __device__(float input) { return powf(input, p); }, handle.get_stream()); } diff --git a/cpp/include/raft/spatial/knn/detail/common_faiss.h b/cpp/include/raft/spatial/knn/detail/common_faiss.h index 5618186dfc..0c0398a336 100644 --- a/cpp/include/raft/spatial/knn/detail/common_faiss.h +++ b/cpp/include/raft/spatial/knn/detail/common_faiss.h @@ -27,26 +27,37 @@ namespace spatial { namespace knn { namespace detail { -inline faiss::MetricType build_faiss_metric(raft::distance::DistanceType metric) -{ +inline faiss::MetricType build_faiss_metric( + raft::distance::DistanceType metric) { switch (metric) { case raft::distance::DistanceType::CosineExpanded: return faiss::MetricType::METRIC_INNER_PRODUCT; case raft::distance::DistanceType::CorrelationExpanded: return faiss::MetricType::METRIC_INNER_PRODUCT; - case raft::distance::DistanceType::L2Expanded: return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L2Unexpanded: return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L2SqrtExpanded: return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L2SqrtUnexpanded: return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L1: return faiss::MetricType::METRIC_L1; - case raft::distance::DistanceType::InnerProduct: return faiss::MetricType::METRIC_INNER_PRODUCT; - case raft::distance::DistanceType::LpUnexpanded: return faiss::MetricType::METRIC_Lp; - case raft::distance::DistanceType::Linf: return faiss::MetricType::METRIC_Linf; - case raft::distance::DistanceType::Canberra: return faiss::MetricType::METRIC_Canberra; - case raft::distance::DistanceType::BrayCurtis: return faiss::MetricType::METRIC_BrayCurtis; + case raft::distance::DistanceType::L2Expanded: + return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L2Unexpanded: + return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L2SqrtExpanded: + return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L2SqrtUnexpanded: + return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L1: + return faiss::MetricType::METRIC_L1; + case raft::distance::DistanceType::InnerProduct: + return faiss::MetricType::METRIC_INNER_PRODUCT; + case raft::distance::DistanceType::LpUnexpanded: + return faiss::MetricType::METRIC_Lp; + case raft::distance::DistanceType::Linf: + return faiss::MetricType::METRIC_Linf; + case raft::distance::DistanceType::Canberra: + return faiss::MetricType::METRIC_Canberra; + case raft::distance::DistanceType::BrayCurtis: + return faiss::MetricType::METRIC_BrayCurtis; case raft::distance::DistanceType::JensenShannon: return faiss::MetricType::METRIC_JensenShannon; - default: THROW("MetricType not supported: %d", metric); + default: + THROW("MetricType not supported: %d", metric); } } diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh index 049c11514c..7d87254cb6 100644 --- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh +++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh @@ -35,8 +35,7 @@ namespace knn { namespace detail { template -DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) -{ +DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) { value_t sin_0 = sin(0.5 * (x1 - y1)); value_t sin_1 = sin(0.5 * (x2 - y2)); value_t rdist = sin_0 * sin_0 + cos(x1) * cos(y1) * sin_1 * sin_1; @@ -57,36 +56,34 @@ DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) * @param[in] n_index_rows number of rows in index array * @param[in] k number of closest neighbors to return */ -template -__global__ void haversine_knn_kernel(value_idx* out_inds, - value_t* out_dists, - const value_t* index, - const value_t* query, - size_t n_index_rows, - int k) -{ +template +__global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists, + const value_t *index, const value_t *query, + size_t n_index_rows, int k) { constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ value_t smemK[kNumWarps * warp_q]; __shared__ value_idx smemV[kNumWarps * warp_q]; - faiss::gpu:: - BlockSelect, warp_q, thread_q, tpb> - heap(faiss::gpu::Limits::getMax(), -1, smemK, smemV, k); + faiss::gpu::BlockSelect, warp_q, thread_q, + tpb> + heap(faiss::gpu::Limits::getMax(), -1, smemK, smemV, k); // Grid is exactly sized to rows available int limit = faiss::gpu::utils::roundDown(n_index_rows, faiss::gpu::kWarpSize); - const value_t* query_ptr = query + (blockIdx.x * 2); - value_t x1 = query_ptr[0]; - value_t x2 = query_ptr[1]; + const value_t *query_ptr = query + (blockIdx.x * 2); + value_t x1 = query_ptr[0]; + value_t x2 = query_ptr[1]; int i = threadIdx.x; for (; i < limit; i += tpb) { - const value_t* idx_ptr = index + (i * 2); - value_t y1 = idx_ptr[0]; - value_t y2 = idx_ptr[1]; + const value_t *idx_ptr = index + (i * 2); + value_t y1 = idx_ptr[0]; + value_t y2 = idx_ptr[1]; value_t dist = compute_haversine(x1, y1, x2, y2); @@ -95,9 +92,9 @@ __global__ void haversine_knn_kernel(value_idx* out_inds, // Handle last remainder fraction of a warp of elements if (i < n_index_rows) { - const value_t* idx_ptr = index + (i * 2); - value_t y1 = idx_ptr[0]; - value_t y2 = idx_ptr[1]; + const value_t *idx_ptr = index + (i * 2); + value_t y1 = idx_ptr[0]; + value_t y2 = idx_ptr[1]; value_t dist = compute_haversine(x1, y1, x2, y2); @@ -108,7 +105,7 @@ __global__ void haversine_knn_kernel(value_idx* out_inds, for (int i = threadIdx.x; i < k; i += tpb) { out_dists[blockIdx.x * k + i] = smemK[i]; - out_inds[blockIdx.x * k + i] = smemV[i]; + out_inds[blockIdx.x * k + i] = smemV[i]; } } @@ -129,15 +126,10 @@ __global__ void haversine_knn_kernel(value_idx* out_inds, * @param[in] stream stream to order kernel launch */ template -void haversine_knn(value_idx* out_inds, - value_t* out_dists, - const value_t* index, - const value_t* query, - size_t n_index_rows, - size_t n_query_rows, - int k, - cudaStream_t stream) -{ +void haversine_knn(value_idx *out_inds, value_t *out_dists, + const value_t *index, const value_t *query, + size_t n_index_rows, size_t n_query_rows, int k, + cudaStream_t stream) { haversine_knn_kernel<<>>( out_inds, out_dists, index, query, n_index_rows, k); } diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index a276ae45ad..09494e9eb1 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -43,18 +43,13 @@ namespace spatial { namespace knn { namespace detail { -template -__global__ void knn_merge_parts_kernel(value_t* inK, - value_idx* inV, - value_t* outK, - value_idx* outV, - size_t n_samples, - int n_parts, - value_t initK, - value_idx initV, - int k, - value_idx* translations) -{ +template +__global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV, + value_t *outK, value_idx *outV, + size_t n_samples, int n_parts, + value_t initK, value_idx initV, int k, + value_idx *translations) { constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ value_t smemK[kNumWarps * warp_q]; @@ -63,33 +58,34 @@ __global__ void knn_merge_parts_kernel(value_t* inK, /** * Uses shared memory */ - faiss::gpu:: - BlockSelect, warp_q, thread_q, tpb> - heap(initK, initV, smemK, smemV, k); + faiss::gpu::BlockSelect, warp_q, thread_q, + tpb> + heap(initK, initV, smemK, smemV, k); // Grid is exactly sized to rows available - int row = blockIdx.x; + int row = blockIdx.x; int total_k = k * n_parts; int i = threadIdx.x; // Get starting pointers for cols in current thread - int part = i / k; + int part = i / k; size_t row_idx = (row * k) + (part * n_samples * k); int col = i % k; - value_t* inKStart = inK + (row_idx + col); - value_idx* inVStart = inV + (row_idx + col); + value_t *inKStart = inK + (row_idx + col); + value_idx *inVStart = inV + (row_idx + col); - int limit = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize); + int limit = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize); value_idx translation = 0; for (; i < limit; i += tpb) { translation = translations[part]; heap.add(*inKStart, (*inVStart) + translation); - part = (i + tpb) / k; + part = (i + tpb) / k; row_idx = (row * k) + (part * n_samples * k); col = (i + tpb) % k; @@ -112,27 +108,22 @@ __global__ void knn_merge_parts_kernel(value_t* inK, } } -template -inline void knn_merge_parts_impl(value_t* inK, - value_idx* inV, - value_t* outK, - value_idx* outV, - size_t n_samples, - int n_parts, - int k, - cudaStream_t stream, - value_idx* translations) -{ +template +inline void knn_merge_parts_impl(value_t *inK, value_idx *inV, value_t *outK, + value_idx *outV, size_t n_samples, int n_parts, + int k, cudaStream_t stream, + value_idx *translations) { auto grid = dim3(n_samples); constexpr int n_threads = (warp_q <= 1024) ? 128 : 64; - auto block = dim3(n_threads); + auto block = dim3(n_threads); auto kInit = faiss::gpu::Limits::getMax(); auto vInit = -1; knn_merge_parts_kernel - <<>>( - inK, inV, outK, outV, n_samples, n_parts, kInit, vInit, k, translations); + <<>>(inK, inV, outK, outV, n_samples, n_parts, + kInit, vInit, k, translations); CUDA_CHECK(cudaPeekAtLastError()); } @@ -151,16 +142,10 @@ inline void knn_merge_parts_impl(value_t* inK, * @param translations mapping of index offsets for each partition */ template -inline void knn_merge_parts(value_t* inK, - value_idx* inV, - value_t* outK, - value_idx* outV, - size_t n_samples, - int n_parts, - int k, - cudaStream_t stream, - value_idx* translations) -{ +inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, + value_idx *outV, size_t n_samples, int n_parts, + int k, cudaStream_t stream, + value_idx *translations) { if (k == 1) knn_merge_parts_impl( inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); @@ -210,33 +195,27 @@ inline void knn_merge_parts(value_t* inK, * @param[in] metricArg metric argument to use. Corresponds to the p arg for lp norm */ template -void brute_force_knn_impl( - std::vector& input, - std::vector& sizes, - IntType D, - float* search_items, - IntType n, - int64_t* res_I, - float* res_D, - IntType k, - std::shared_ptr allocator, - cudaStream_t userStream, - cudaStream_t* internalStreams = nullptr, - int n_int_streams = 0, - bool rowMajorIndex = true, - bool rowMajorQuery = true, - std::vector* translations = nullptr, - raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded, - float metricArg = 0) -{ - ASSERT(input.size() == sizes.size(), "input and sizes vectors should be the same size"); - - std::vector* id_ranges; +void brute_force_knn_impl(std::vector &input, std::vector &sizes, + IntType D, float *search_items, IntType n, + int64_t *res_I, float *res_D, IntType k, + std::shared_ptr allocator, + cudaStream_t userStream, + cudaStream_t *internalStreams = nullptr, + int n_int_streams = 0, bool rowMajorIndex = true, + bool rowMajorQuery = true, + std::vector *translations = nullptr, + raft::distance::DistanceType metric = + raft::distance::DistanceType::L2Expanded, + float metricArg = 0) { + ASSERT(input.size() == sizes.size(), + "input and sizes vectors should be the same size"); + + std::vector *id_ranges; if (translations == nullptr) { // If we don't have explicit translations // for offsets of the indices, build them // from the local partitions - id_ranges = new std::vector(); + id_ranges = new std::vector(); int64_t total_n = 0; for (size_t i = 0; i < input.size(); i++) { id_ranges->push_back(total_n); @@ -249,27 +228,31 @@ void brute_force_knn_impl( // perform preprocessing std::unique_ptr> query_metric_processor = - create_processor(metric, n, D, k, rowMajorQuery, userStream, allocator); + create_processor(metric, n, D, k, rowMajorQuery, userStream, + allocator); query_metric_processor->preprocess(search_items); - std::vector>> metric_processors(input.size()); + std::vector>> metric_processors( + input.size()); for (size_t i = 0; i < input.size(); i++) { - metric_processors[i] = - create_processor(metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); + metric_processors[i] = create_processor( + metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); metric_processors[i]->preprocess(input[i]); } int device; CUDA_CHECK(cudaGetDevice(&device)); - raft::mr::device::buffer trans(allocator, userStream, id_ranges->size()); - raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream); + raft::mr::device::buffer trans(allocator, userStream, + id_ranges->size()); + raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), + userStream); raft::mr::device::buffer all_D(allocator, userStream, 0); raft::mr::device::buffer all_I(allocator, userStream, 0); - float* out_D = res_D; - int64_t* out_I = res_I; + float *out_D = res_D; + int64_t *out_I = res_I; if (input.size() > 1) { all_D.resize(input.size() * k * n, userStream); @@ -283,10 +266,11 @@ void brute_force_knn_impl( if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream)); for (size_t i = 0; i < input.size(); i++) { - float* out_d_ptr = out_D + (i * k * n); - int64_t* out_i_ptr = out_I + (i * k * n); + float *out_d_ptr = out_D + (i * k * n); + int64_t *out_i_ptr = out_I + (i * k * n); - cudaStream_t stream = raft::select_stream(userStream, internalStreams, n_int_streams, i); + cudaStream_t stream = + raft::select_stream(userStream, internalStreams, n_int_streams, i); switch (metric) { case raft::distance::DistanceType::Haversine: @@ -295,7 +279,8 @@ void brute_force_knn_impl( "Haversine distance requires 2 dimensions " "(latitude / longitude)."); - haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, k, stream); + haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, + k, stream); break; default: faiss::MetricType m = build_faiss_metric(metric); @@ -306,18 +291,18 @@ void brute_force_knn_impl( gpu_res.setDefaultStream(device, stream); faiss::gpu::GpuDistanceParams args; - args.metric = m; - args.metricArg = metricArg; - args.k = k; - args.dims = D; - args.vectors = input[i]; + args.metric = m; + args.metricArg = metricArg; + args.k = k; + args.dims = D; + args.vectors = input[i]; args.vectorsRowMajor = rowMajorIndex; - args.numVectors = sizes[i]; - args.queries = search_items; + args.numVectors = sizes[i]; + args.queries = search_items; args.queriesRowMajor = rowMajorQuery; - args.numQueries = n; - args.outDistances = out_d_ptr; - args.outIndices = out_i_ptr; + args.numQueries = n; + args.outDistances = out_d_ptr; + args.outIndices = out_i_ptr; /** * @todo: Until FAISS supports pluggable allocation strategies, @@ -340,7 +325,8 @@ void brute_force_knn_impl( if (input.size() > 1 || translations != nullptr) { // This is necessary for proper index translations. If there are // no translations or partitions to combine, it can be skipped. - knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, trans.data()); + knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, + trans.data()); } // Perform necessary post-processing @@ -348,12 +334,14 @@ void brute_force_knn_impl( metric == raft::distance::DistanceType::L2SqrtUnexpanded || metric == raft::distance::DistanceType::LpUnexpanded) { /** - * post-processing - */ + * post-processing + */ float p = 0.5; // standard l2 - if (metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / metricArg; + if (metric == raft::distance::DistanceType::LpUnexpanded) + p = 1.0 / metricArg; raft::linalg::unaryOp( - res_D, res_D, n * k, [p] __device__(float input) { return powf(input, p); }, userStream); + res_D, res_D, n * k, + [p] __device__(float input) { return powf(input, p); }, userStream); } query_metric_processor->revert(search_items); diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index 6e983d1f42..a645412c2f 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -39,11 +39,11 @@ using deviceAllocator = raft::mr::device::allocator; template class MetricProcessor { public: - virtual void preprocess(math_t* data) {} + virtual void preprocess(math_t *data) {} - virtual void revert(math_t* data) {} + virtual void revert(math_t *data) {} - virtual void postprocess(math_t* data) {} + virtual void postprocess(math_t *data) {} virtual ~MetricProcessor() = default; }; @@ -60,10 +60,7 @@ class CosineMetricProcessor : public MetricProcessor { raft::mr::device::buffer colsums_; public: - CosineMetricProcessor(size_t n_rows, - size_t n_cols, - int k, - bool row_major, + CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, cudaStream_t stream, std::shared_ptr allocator) : device_allocator_(allocator), @@ -72,51 +69,30 @@ class CosineMetricProcessor : public MetricProcessor { n_cols_(n_cols), n_rows_(n_rows), row_major_(row_major), - k_(k) - { - } + k_(k) {} - void preprocess(math_t* data) - { - raft::linalg::rowNorm(colsums_.data(), - data, - n_cols_, - n_rows_, - raft::linalg::NormType::L2Norm, - row_major_, - stream_, + void preprocess(math_t *data) { + raft::linalg::rowNorm(colsums_.data(), data, n_cols_, n_rows_, + raft::linalg::NormType::L2Norm, row_major_, stream_, [] __device__(math_t in) { return sqrtf(in); }); raft::linalg::matrixVectorOp( - data, - data, - colsums_.data(), - n_cols_, - n_rows_, - row_major_, - false, + data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false, [] __device__(math_t mat_in, math_t vec_in) { return mat_in / vec_in; }, stream_); } - void revert(math_t* data) - { + void revert(math_t *data) { raft::linalg::matrixVectorOp( - data, - data, - colsums_.data(), - n_cols_, - n_rows_, - row_major_, - false, + data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false, [] __device__(math_t mat_in, math_t vec_in) { return mat_in * vec_in; }, stream_); } - void postprocess(math_t* data) - { + void postprocess(math_t *data) { raft::linalg::unaryOp( - data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, stream_); + data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, + stream_); } ~CosineMetricProcessor() = default; @@ -127,64 +103,43 @@ class CorrelationMetricProcessor : public CosineMetricProcessor { using cosine = CosineMetricProcessor; public: - CorrelationMetricProcessor(size_t n_rows, - size_t n_cols, - int k, - bool row_major, - cudaStream_t stream, + CorrelationMetricProcessor(size_t n_rows, size_t n_cols, int k, + bool row_major, cudaStream_t stream, std::shared_ptr allocator) - : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream, allocator), - means_(allocator, stream, n_rows) - { - } + : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream, + allocator), + means_(allocator, stream, n_rows) {} - void preprocess(math_t* data) - { + void preprocess(math_t *data) { math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_; - raft::linalg::reduce(means_.data(), - data, - cosine::n_cols_, - cosine::n_rows_, - (math_t)0.0, - cosine::row_major_, - true, + raft::linalg::reduce(means_.data(), data, cosine::n_cols_, cosine::n_rows_, + (math_t)0.0, cosine::row_major_, true, cosine::stream_); raft::linalg::unaryOp( - means_.data(), - means_.data(), - cosine::n_rows_, + means_.data(), means_.data(), cosine::n_rows_, [=] __device__(math_t in) { return in * normalizer_const; }, cosine::stream_); - raft::stats::meanCenter(data, - data, - means_.data(), - cosine::n_cols_, - cosine::n_rows_, - cosine::row_major_, - false, + raft::stats::meanCenter(data, data, means_.data(), cosine::n_cols_, + cosine::n_rows_, cosine::row_major_, false, cosine::stream_); CosineMetricProcessor::preprocess(data); } - void revert(math_t* data) - { + void revert(math_t *data) { CosineMetricProcessor::revert(data); - raft::stats::meanAdd(data, - data, - means_.data(), - cosine::n_cols_, - cosine::n_rows_, - cosine::row_major_, - false, + raft::stats::meanAdd(data, data, means_.data(), cosine::n_cols_, + cosine::n_rows_, cosine::row_major_, false, cosine::stream_); } - void postprocess(math_t* data) { CosineMetricProcessor::postprocess(data); } + void postprocess(math_t *data) { + CosineMetricProcessor::postprocess(data); + } ~CorrelationMetricProcessor() = default; @@ -194,36 +149,33 @@ class CorrelationMetricProcessor : public CosineMetricProcessor { template class DefaultMetricProcessor : public MetricProcessor { public: - void preprocess(math_t* data) {} + void preprocess(math_t *data) {} - void revert(math_t* data) {} + void revert(math_t *data) {} - void postprocess(math_t* data) {} + void postprocess(math_t *data) {} ~DefaultMetricProcessor() = default; }; template inline std::unique_ptr> create_processor( - distance::DistanceType metric, - int n, - int D, - int k, - bool rowMajorQuery, - cudaStream_t userStream, - std::shared_ptr allocator) -{ - MetricProcessor* mp = nullptr; + distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery, + cudaStream_t userStream, std::shared_ptr allocator) { + MetricProcessor *mp = nullptr; switch (metric) { case distance::DistanceType::CosineExpanded: - mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream, allocator); + mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream, + allocator); break; case distance::DistanceType::CorrelationExpanded: - mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, userStream, allocator); + mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, + userStream, allocator); break; - default: mp = new DefaultMetricProcessor(); + default: + mp = new DefaultMetricProcessor(); } return std::unique_ptr>(mp); diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index 42ee11ba5b..a3a1972c13 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -28,17 +28,12 @@ namespace knn { using deviceAllocator = raft::mr::device::allocator; template -inline void knn_merge_parts(value_t* inK, - value_idx* inV, - value_t* outK, - value_idx* outV, - size_t n_samples, - int n_parts, - int k, - cudaStream_t stream, - value_idx* translations) -{ - detail::knn_merge_parts(inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); +inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, + value_idx *outV, size_t n_samples, int n_parts, + int k, cudaStream_t stream, + value_idx *translations) { + detail::knn_merge_parts(inK, inV, outK, outV, n_samples, n_parts, k, stream, + translations); } /** @@ -64,42 +59,23 @@ inline void knn_merge_parts(value_t* inK, * @param[in] expanded should lp-based distances be returned in their expanded * form (e.g., without raising to the 1/p power). */ -inline void brute_force_knn(raft::handle_t const& handle, - std::vector& input, - std::vector& sizes, - int D, - float* search_items, - int n, - int64_t* res_I, - float* res_D, - int k, - bool rowMajorIndex = true, - bool rowMajorQuery = true, - std::vector* translations = nullptr, - distance::DistanceType metric = distance::DistanceType::L2Unexpanded, - float metric_arg = 2.0f) -{ - ASSERT(input.size() == sizes.size(), "input and sizes vectors must be the same size"); +inline void brute_force_knn( + raft::handle_t const &handle, std::vector &input, + std::vector &sizes, int D, float *search_items, int n, int64_t *res_I, + float *res_D, int k, bool rowMajorIndex = true, bool rowMajorQuery = true, + std::vector *translations = nullptr, + distance::DistanceType metric = distance::DistanceType::L2Unexpanded, + float metric_arg = 2.0f) { + ASSERT(input.size() == sizes.size(), + "input and sizes vectors must be the same size"); std::vector int_streams = handle.get_internal_streams(); - detail::brute_force_knn_impl(input, - sizes, - D, - search_items, - n, - res_I, - res_D, - k, - handle.get_device_allocator(), - handle.get_stream(), - int_streams.data(), - handle.get_num_internal_streams(), - rowMajorIndex, - rowMajorQuery, - translations, - metric, - metric_arg); + detail::brute_force_knn_impl(input, sizes, D, search_items, n, res_I, res_D, + k, handle.get_device_allocator(), + handle.get_stream(), int_streams.data(), + handle.get_num_internal_streams(), rowMajorIndex, + rowMajorQuery, translations, metric, metric_arg); } } // namespace knn diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index 7032a0009e..922ae7cfab 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -24,7 +24,8 @@ using namespace matrix; // aggregate of control params for Eigen Solver: // -template +template struct cluster_solver_config_t { size_type_t n_clusters; size_type_t maxIter; @@ -34,37 +35,25 @@ struct cluster_solver_config_t { unsigned long long seed{123456}; }; -template +template struct kmeans_solver_t { - explicit kmeans_solver_t( - cluster_solver_config_t const& config) - : config_(config) - { - } + explicit kmeans_solver_t(cluster_solver_config_t const& config) + : config_(config) {} template - std::pair solve(handle_t const& handle, - thrust_exe_policy_t t_exe_policy, - size_type_t n_obs_vecs, - size_type_t dim, - value_type_t const* __restrict__ obs, - index_type_t* __restrict__ codes) const - { + std::pair solve( + handle_t const& handle, thrust_exe_policy_t t_exe_policy, + size_type_t n_obs_vecs, size_type_t dim, + value_type_t const* __restrict__ obs, + index_type_t* __restrict__ codes) const { RAFT_EXPECTS(obs != nullptr, "Null obs buffer."); RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); value_type_t residual{}; index_type_t iters{}; - kmeans(handle, - t_exe_policy, - n_obs_vecs, - dim, - config_.n_clusters, - config_.tol, - config_.maxIter, - obs, - codes, - residual, - iters, + kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters, + config_.tol, config_.maxIter, obs, codes, residual, iters, config_.seed); return std::make_pair(residual, iters); } diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp index 156b996586..e36dca2e0c 100644 --- a/cpp/include/raft/spectral/eigen_solvers.hpp +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -23,7 +23,8 @@ using namespace matrix; // aggregate of control params for Eigen Solver: // -template +template struct eigen_solver_config_t { size_type_t n_eigVecs; size_type_t maxIter; @@ -33,59 +34,42 @@ struct eigen_solver_config_t { bool reorthogonalize{false}; unsigned long long seed{ - 1234567}; // CAVEAT: this default value is now common to all instances of using seed in - // Lanczos; was not the case before: there were places where a default seed = 123456 - // was used; this may trigger slightly different # solver iterations + 1234567}; // CAVEAT: this default value is now common to all instances of using seed in Lanczos; was not the case before: there were places where a default seed = 123456 was used; this may trigger slightly different # solver iterations }; -template +template struct lanczos_solver_t { - explicit lanczos_solver_t( - eigen_solver_config_t const& config) - : config_(config) - { - } + explicit lanczos_solver_t(eigen_solver_config_t const& config) + : config_(config) {} - index_type_t solve_smallest_eigenvectors(handle_t const& handle, - sparse_matrix_t const& A, - value_type_t* __restrict__ eigVals, - value_type_t* __restrict__ eigVecs) const - { + index_type_t solve_smallest_eigenvectors( + handle_t const& handle, + sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const { RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; - computeSmallestEigenvectors(handle, - A, - config_.n_eigVecs, - config_.maxIter, - config_.restartIter, - config_.tol, - config_.reorthogonalize, - iters, - eigVals, - eigVecs, - config_.seed); + computeSmallestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, + config_.restartIter, config_.tol, + config_.reorthogonalize, iters, eigVals, + eigVecs, config_.seed); return iters; } - index_type_t solve_largest_eigenvectors(handle_t const& handle, - sparse_matrix_t const& A, - value_type_t* __restrict__ eigVals, - value_type_t* __restrict__ eigVecs) const - { + index_type_t solve_largest_eigenvectors( + handle_t const& handle, + sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const { RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; - computeLargestEigenvectors(handle, - A, - config_.n_eigVecs, - config_.maxIter, - config_.restartIter, - config_.tol, - config_.reorthogonalize, - iters, - eigVals, - eigVecs, + computeLargestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, + config_.restartIter, config_.tol, + config_.reorthogonalize, iters, eigVals, eigVecs, config_.seed); return iters; } diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index e0c3565b77..fb05bff3e2 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -44,15 +44,15 @@ using namespace raft::linalg; // Useful grid settings // ========================================================= -constexpr unsigned int BLOCK_SIZE = 1024; -constexpr unsigned int WARP_SIZE = 32; +constexpr unsigned int BLOCK_SIZE = 1024; +constexpr unsigned int WARP_SIZE = 32; constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); // ========================================================= // CUDA kernels // ========================================================= -/** +/** * @brief Compute distances between observation vectors and centroids * Block dimensions should be (warpSize, 1, * blockSize/warpSize). Ideally, the grid is large enough so there @@ -76,13 +76,11 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); * initialized to zero. */ template -static __global__ void computeDistances(index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, - value_type_t* __restrict__ dists) -{ +static __global__ void computeDistances( + index_type_t n, index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists) { // Loop index index_type_t i; @@ -117,10 +115,12 @@ static __global__ void computeDistances(index_type_t n, // Perform reduction on warp for (i = WARP_SIZE / 2; i > 0; i /= 2) - dist_private += __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i); + dist_private += + __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i); // Write result to global memory - if (threadIdx.x == 0) atomicAdd(dists + IDX(gidz, gidy, n), dist_private); + if (threadIdx.x == 0) + atomicAdd(dists + IDX(gidz, gidy, n), dist_private); // Move to another observation vector gidz += blockDim.z * gridDim.z; @@ -135,8 +135,8 @@ static __global__ void computeDistances(index_type_t n, } } -/** - * @brief Find closest centroid to observation vectors. +/** + * @brief Find closest centroid to observation vectors. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. * @tparam index_type_t the type of data used for indexing. @@ -157,12 +157,10 @@ static __global__ void computeDistances(index_type_t n, * cluster. Entries must be initialized to zero. */ template -static __global__ void minDistances(index_type_t n, - index_type_t k, +static __global__ void minDistances(index_type_t n, index_type_t k, value_type_t* __restrict__ dists, index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) -{ + index_type_t* __restrict__ clusterSizes) { // Loop index index_type_t i, j; @@ -181,8 +179,8 @@ static __global__ void minDistances(index_type_t n, dist_min = dists[IDX(i, 0, n)]; for (j = 1; j < k; ++j) { dist_curr = dists[IDX(i, j, n)]; - code_min = (dist_curr < dist_min) ? j : code_min; - dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; + code_min = (dist_curr < dist_min) ? j : code_min; + dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; } // Transfer result to global memory @@ -197,8 +195,8 @@ static __global__ void minDistances(index_type_t n, } } -/** - * @brief Check if newly computed distances are smaller than old distances. +/** + * @brief Check if newly computed distances are smaller than old distances. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. * @tparam index_type_t the type of data used for indexing. @@ -221,8 +219,7 @@ static __global__ void minDistances2(index_type_t n, value_type_t* __restrict__ dists_old, const value_type_t* __restrict__ dists_new, index_type_t* __restrict__ codes_old, - index_type_t code_new) -{ + index_type_t code_new) { // Loop index index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; @@ -247,7 +244,7 @@ static __global__ void minDistances2(index_type_t n, } } -/** +/** * @brief Compute size of k-means clusters. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. @@ -259,11 +256,9 @@ static __global__ void minDistances2(index_type_t n, * cluster. Entries must be initialized to zero. */ template -static __global__ void computeClusterSizes(index_type_t n, - index_type_t k, - const index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) -{ +static __global__ void computeClusterSizes( + index_type_t n, index_type_t k, const index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) { index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { atomicAdd(clusterSizes + codes[i], 1); @@ -271,8 +266,8 @@ static __global__ void computeClusterSizes(index_type_t n, } } -/** - * @brief Divide rows of centroid matrix by cluster sizes. +/** + * @brief Divide rows of centroid matrix by cluster sizes. * Divides the ith column of the sum matrix by the size of the ith * cluster. If the sum matrix has been initialized so that the ith * row is the sum of all observation vectors in the ith cluster, @@ -293,11 +288,9 @@ static __global__ void computeClusterSizes(index_type_t n, * column is the mean position of a cluster). */ template -static __global__ void divideCentroids(index_type_t d, - index_type_t k, - const index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ centroids) -{ +static __global__ void divideCentroids( + index_type_t d, index_type_t k, const index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids) { // Global indices index_type_t gidx, gidy; @@ -348,17 +341,15 @@ static __global__ void divideCentroids(index_type_t d, * coordinates. * @return Zero if successful. Otherwise non-zero. */ -template +template static int chooseNewCentroid(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, - index_type_t n, - index_type_t d, - index_type_t k, + index_type_t n, index_type_t d, index_type_t k, value_type_t rand, const value_type_t* __restrict__ obs, value_type_t* __restrict__ dists, - value_type_t* __restrict__ centroid) -{ + value_type_t* __restrict__ centroid) { // Cumulative sum of distances value_type_t* distsCumSum = dists + n; // Residual sum of squares @@ -367,43 +358,43 @@ static int chooseNewCentroid(handle_t const& handle, index_type_t obsIndex; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Compute cumulative sum of distances - thrust::inclusive_scan(thrust_exec_policy, - thrust::device_pointer_cast(dists), + thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), thrust::device_pointer_cast(distsCumSum)); CHECK_CUDA(stream); - CUDA_TRY(cudaMemcpyAsync( - &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); + CUDA_TRY(cudaMemcpyAsync(&distsSum, distsCumSum + n - 1, sizeof(value_type_t), + cudaMemcpyDeviceToHost, stream)); // Randomly choose observation vector // Probabilities are proportional to square of distance to closest // centroid (see k-means++ algorithm) // - // seg-faults due to Thrust bug - // on binary-search-like algorithms - // when run with stream dependent - // execution policies; fixed on Thrust GitHub - // hence replace w/ linear interpolation, - // until the Thrust issue gets resolved: + //seg-faults due to Thrust bug + //on binary-search-like algorithms + //when run with stream dependent + //execution policies; fixed on Thrust GitHub + //hence replace w/ linear interpolation, + //until the Thrust issue gets resolved: // // obsIndex = (thrust::lower_bound( // thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), // thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - // thrust::device_pointer_cast(distsCumSum)); // - // linear interpolation logic: + //linear interpolation logic: //{ value_type_t minSum{0}; - CUDA_TRY( - cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); + CUDA_TRY(cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), + cudaMemcpyDeviceToHost, stream)); CHECK_CUDA(stream); if (distsSum > minSum) { value_type_t vIndex = static_cast(n - 1); - obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / (distsSum - minSum)); + obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / + (distsSum - minSum)); } else { obsIndex = 0; } @@ -414,23 +405,21 @@ static int chooseNewCentroid(handle_t const& handle, obsIndex = min(obsIndex, n - 1); // Record new centroid position - CUDA_TRY(cudaMemcpyAsync(centroid, - obs + IDX(0, obsIndex, d), - d * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, + CUDA_TRY(cudaMemcpyAsync(centroid, obs + IDX(0, obsIndex, d), + d * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); return 0; } /** - * @brief Choose initial cluster centroids for k-means algorithm. + * @brief Choose initial cluster centroids for k-means algorithm. * Centroids are randomly chosen with k-means++ algorithm * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy + * @param thrust_exec_policy thrust execution policy * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. @@ -450,19 +439,14 @@ static int chooseNewCentroid(handle_t const& handle, * distance between observation vectors and the closest centroid. * @return Zero if successful. Otherwise non-zero. */ -template -static int initializeCentroids(handle_t const& handle, - thrust_exe_pol_t thrust_exec_policy, - index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - value_type_t* __restrict__ centroids, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ dists, - unsigned long long seed) -{ +template +static int initializeCentroids( + handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, + index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, + value_type_t* __restrict__ centroids, index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ dists, + unsigned long long seed) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -475,7 +459,7 @@ static int initializeCentroids(handle_t const& handle, thrust::uniform_real_distribution uniformDist(0, 1); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); constexpr index_type_t grid_lower_bound{65535}; @@ -487,43 +471,36 @@ static int initializeCentroids(handle_t const& handle, dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE}; // CUDA grid dimensions - dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - 1, - min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; + dim3 gridDim_warp{ + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1, + min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; // CUDA grid dimensions - dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1}; + dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), + 1, 1}; // Assign observation vectors to code 0 CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); // Choose first centroid - thrust::fill(thrust_exec_policy, - thrust::device_pointer_cast(dists), - thrust::device_pointer_cast(dists + n), - 1); + thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n), 1); CHECK_CUDA(stream); - if (chooseNewCentroid( - handle, thrust_exec_policy, n, d, k, uniformDist(rng), obs, dists, centroids)) + if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), + obs, dists, centroids)) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream)); - computeDistances<<>>(n, d, 1, obs, centroids, dists); + computeDistances<<>>( + n, d, 1, obs, centroids, dists); CHECK_CUDA(stream); // Choose remaining centroids for (i = 1; i < k; ++i) { // Choose ith centroid - if (chooseNewCentroid(handle, - thrust_exec_policy, - n, - d, - k, - uniformDist(rng), - obs, - dists, - centroids + IDX(0, i, d))) + if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), + obs, dists, centroids + IDX(0, i, d))) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid @@ -533,20 +510,22 @@ static int initializeCentroids(handle_t const& handle, CHECK_CUDA(stream); // Recompute minimum distances - minDistances2<<>>(n, dists, dists + n, codes, i); + minDistances2<<>>(n, dists, dists + n, + codes, i); CHECK_CUDA(stream); } // Compute cluster sizes CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); - computeClusterSizes<<>>(n, k, codes, clusterSizes); + computeClusterSizes<<>>(n, k, codes, + clusterSizes); CHECK_CUDA(stream); return 0; } -/** - * @brief Find cluster centroids closest to observation vectors. +/** + * @brief Find cluster centroids closest to observation vectors. * Distance is measured with Euclidean norm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. @@ -574,21 +553,16 @@ static int initializeCentroids(handle_t const& handle, * of squares of assignment. * @return Zero if successful. Otherwise non-zero. */ -template -static int assignCentroids(handle_t const& handle, - thrust_exe_pol_t thrust_exec_policy, - index_type_t n, - index_type_t d, - index_type_t k, - const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, - value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, - value_type_t* residual_host) -{ +template +static int assignCentroids( + handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, + index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, + value_type_t* residual_host) { auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Compute distance between centroids and observation vectors CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); @@ -600,9 +574,11 @@ static int assignCentroids(handle_t const& handle, constexpr index_type_t grid_lower_bound{65535}; gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound); gridDim.y = min(k, grid_lower_bound); - gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); + gridDim.z = + min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); - computeDistances<<>>(n, d, k, obs, centroids, dists); + computeDistances<<>>(n, d, k, obs, centroids, + dists); CHECK_CUDA(stream); // Find centroid closest to each observation vector @@ -610,21 +586,23 @@ static int assignCentroids(handle_t const& handle, blockDim.x = BLOCK_SIZE; blockDim.y = 1; blockDim.z = 1; - gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); - gridDim.y = 1; - gridDim.z = 1; - minDistances<<>>(n, k, dists, codes, clusterSizes); + gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); + gridDim.y = 1; + gridDim.z = 1; + minDistances<<>>(n, k, dists, codes, + clusterSizes); CHECK_CUDA(stream); // Compute residual sum of squares - *residual_host = thrust::reduce( - thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n)); + *residual_host = + thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n)); return 0; } -/** - * @brief Update cluster centroids for k-means algorithm. +/** + * @brief Update cluster centroids for k-means algorithm. * All clusters are assumed to be non-empty. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. @@ -650,31 +628,29 @@ static int assignCentroids(handle_t const& handle, * Workspace. * @return Zero if successful. Otherwise non-zero. */ -template +template static int updateCentroids(handle_t const& handle, - thrust_exe_pol_t thrust_exec_policy, - index_type_t n, - index_type_t d, - index_type_t k, + thrust_exe_pol_t thrust_exec_policy, index_type_t n, + index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, const index_type_t* __restrict__ codes, const index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ centroids, value_type_t* __restrict__ work, - index_type_t* __restrict__ work_int) -{ + index_type_t* __restrict__ work_int) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- // Useful constants - const value_type_t one = 1; + const value_type_t one = 1; const value_type_t zero = 0; constexpr index_type_t grid_lower_bound{65535}; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Device memory thrust::device_ptr obs_copy(work); @@ -682,56 +658,34 @@ static int updateCentroids(handle_t const& handle, thrust::device_ptr rows(work_int + d * n); // Take transpose of observation matrix - CUBLAS_CHECK(cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - d, - &one, - obs, - d, - &zero, - (value_type_t*)NULL, - n, - thrust::raw_pointer_cast(obs_copy), - n, - stream)); + CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, n, d, &one, obs, + d, &zero, (value_type_t*)NULL, n, + thrust::raw_pointer_cast(obs_copy), n, stream)); // Cluster assigned to each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); CHECK_CUDA(stream); - thrust::transform(thrust_exec_policy, - rows, - rows + d * n, - thrust::make_constant_iterator(n), - rows, + thrust::transform(thrust_exec_policy, rows, rows + d * n, + thrust::make_constant_iterator(n), rows, thrust::modulus()); CHECK_CUDA(stream); - thrust::gather( - thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy); + thrust::gather(thrust_exec_policy, rows, rows + d * n, + thrust::device_pointer_cast(codes), codes_copy); CHECK_CUDA(stream); // Row associated with each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); CHECK_CUDA(stream); - thrust::transform(thrust_exec_policy, - rows, - rows + d * n, - thrust::make_constant_iterator(n), - rows, + thrust::transform(thrust_exec_policy, rows, rows + d * n, + thrust::make_constant_iterator(n), rows, thrust::divides()); CHECK_CUDA(stream); // Sort and reduce to add observation vectors in same cluster - thrust::stable_sort_by_key(thrust_exec_policy, - codes_copy, - codes_copy + d * n, + thrust::stable_sort_by_key(thrust_exec_policy, codes_copy, codes_copy + d * n, make_zip_iterator(make_tuple(obs_copy, rows))); CHECK_CUDA(stream); - thrust::reduce_by_key(thrust_exec_policy, - rows, - rows + d * n, - obs_copy, + thrust::reduce_by_key(thrust_exec_policy, rows, rows + d * n, obs_copy, codes_copy, // Output to codes_copy is ignored thrust::device_pointer_cast(centroids)); CHECK_CUDA(stream); @@ -742,11 +696,12 @@ static int updateCentroids(handle_t const& handle, dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1}; // CUDA grid dimensions - dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), - 1}; + dim3 gridDim{ + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), 1}; - divideCentroids<<>>(d, k, clusterSizes, centroids); + divideCentroids<<>>(d, k, clusterSizes, + centroids); CHECK_CUDA(stream); return 0; @@ -760,8 +715,8 @@ namespace raft { // k-means algorithm // ========================================================= -/** - * @brief Find clusters with k-means algorithm. +/** + * @brief Find clusters with k-means algorithm. * Initial centroids are chosen with k-means++ algorithm. Empty * clusters are reinitialized by choosing new centroids with * k-means++ algorithm. @@ -799,24 +754,17 @@ namespace raft { * @param seed random seed to be used. * @return error flag. */ -template -int kmeans(handle_t const& handle, - thrust_exe_pol_t thrust_exec_policy, - index_type_t n, - index_type_t d, - index_type_t k, - value_type_t tol, - index_type_t maxiter, - const value_type_t* __restrict__ obs, +template +int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, + index_type_t n, index_type_t d, index_type_t k, value_type_t tol, + index_type_t maxiter, const value_type_t* __restrict__ obs, index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ centroids, - value_type_t* __restrict__ work, - index_type_t* __restrict__ work_int, - value_type_t* residual_host, - index_type_t* iters_host, - unsigned long long seed) -{ + value_type_t* __restrict__ work, index_type_t* __restrict__ work_int, + value_type_t* residual_host, index_type_t* iters_host, + unsigned long long seed) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -838,120 +786,100 @@ int kmeans(handle_t const& handle, // ------------------------------------------------------- auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Trivial cases if (k == 1) { CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); - CUDA_TRY( - cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); - if (updateCentroids( - handle, thrust_exec_policy, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), + cudaMemcpyHostToDevice, stream)); + if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, + clusterSizes, centroids, work, work_int)) WARNING("could not compute k-means centroids"); dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; dim3 gridDim{ - min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - 1, - min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), grid_lower_bound)}; + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1, + min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), + grid_lower_bound)}; CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); - computeDistances<<>>(n, d, 1, obs, centroids, work); + computeDistances<<>>(n, d, 1, obs, centroids, + work); CHECK_CUDA(stream); - *residual_host = thrust::reduce( - thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); + *residual_host = + thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(work), + thrust::device_pointer_cast(work + n)); CHECK_CUDA(stream); return 0; } if (n <= k) { - thrust::sequence(thrust_exec_policy, - thrust::device_pointer_cast(codes), + thrust::sequence(thrust_exec_policy, thrust::device_pointer_cast(codes), thrust::device_pointer_cast(codes + n)); CHECK_CUDA(stream); - thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1); + thrust::fill_n(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), n, 1); CHECK_CUDA(stream); if (n < k) - CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(index_type_t), stream)); - CUDA_TRY(cudaMemcpyAsync( - centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, + (k - n) * sizeof(index_type_t), stream)); + CUDA_TRY(cudaMemcpyAsync(centroids, obs, d * n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, stream)); *residual_host = 0; return 0; } // Initialize cuBLAS - CUBLAS_CHECK(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK( + linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // k-means++ algorithm // ------------------------------------------------------- // Choose initial cluster centroids - if (initializeCentroids( - handle, thrust_exec_policy, n, d, k, obs, centroids, codes, clusterSizes, work, seed)) + if (initializeCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, + codes, clusterSizes, work, seed)) WARNING("could not initialize k-means centroids"); // Apply k-means iteration until convergence for (iter = 0; iter < maxiter; ++iter) { // Update cluster centroids - if (updateCentroids( - handle, thrust_exec_policy, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) + if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, + clusterSizes, centroids, work, work_int)) WARNING("could not update k-means centroids"); // Determine centroid closest to each observation residualPrev = *residual_host; - if (assignCentroids(handle, - thrust_exec_policy, - n, - d, - k, - obs, - centroids, - work, - codes, - clusterSizes, - residual_host)) + if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, + work, codes, clusterSizes, residual_host)) WARNING("could not assign observation vectors to k-means clusters"); // Reinitialize empty clusters with new centroids - index_type_t emptyCentroid = (thrust::find(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), - 0) - - thrust::device_pointer_cast(clusterSizes)); + index_type_t emptyCentroid = + (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), 0) - + thrust::device_pointer_cast(clusterSizes)); // FIXME: emptyCentroid never reaches k (infinite loop) under certain // conditions, such as if obs is corrupt (as seen as a result of a // DataFrame column of NULL edge vals used to create the Graph) while (emptyCentroid < k) { - if (chooseNewCentroid(handle, - thrust_exec_policy, - n, - d, - k, - uniformDist(rng), - obs, - work, + if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, + uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) WARNING("could not replace empty centroid"); - if (assignCentroids(handle, - thrust_exec_policy, - n, - d, - k, - obs, - centroids, - work, - codes, - clusterSizes, - residual_host)) + if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, + work, codes, clusterSizes, residual_host)) WARNING("could not assign observation vectors to k-means clusters"); - emptyCentroid = (thrust::find(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), - 0) - - thrust::device_pointer_cast(clusterSizes)); + emptyCentroid = + (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), 0) - + thrust::device_pointer_cast(clusterSizes)); CHECK_CUDA(stream); } @@ -963,13 +891,14 @@ int kmeans(handle_t const& handle, } // Warning if k-means has failed to converge - if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); + if (std::fabs(residualPrev - (*residual_host)) / n >= tol) + WARNING("k-means failed to converge"); *iters_host = iter; return 0; } -/** +/** * @brief Find clusters with k-means algorithm. * Initial centroids are chosen with k-means++ algorithm. Empty * clusters are reinitialized by choosing new centroids with @@ -997,20 +926,13 @@ int kmeans(handle_t const& handle, * @param seed random seed to be used. * @return error flag */ -template -int kmeans(handle_t const& handle, - thrust_exe_pol_t thrust_exec_policy, - index_type_t n, - index_type_t d, - index_type_t k, - value_type_t tol, - index_type_t maxiter, - const value_type_t* __restrict__ obs, - index_type_t* __restrict__ codes, - value_type_t& residual, - index_type_t& iters, - unsigned long long seed = 123456) -{ +template +int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, + index_type_t n, index_type_t d, index_type_t k, value_type_t tol, + index_type_t maxiter, const value_type_t* __restrict__ obs, + index_type_t* __restrict__ codes, value_type_t& residual, + index_type_t& iters, unsigned long long seed = 123456) { using namespace matrix; // Check that parameters are valid @@ -1027,22 +949,10 @@ int kmeans(handle_t const& handle, vector_t work_int(handle, 2 * d * n); // Perform k-means - return kmeans(handle, - thrust_exec_policy, - n, - d, - k, - tol, - maxiter, - obs, - codes, - clusterSizes.raw(), - centroids.raw(), - work.raw(), - work_int.raw(), - &residual, - &iters, - seed); + return kmeans( + handle, thrust_exec_policy, n, d, k, tol, maxiter, obs, codes, + clusterSizes.raw(), centroids.raw(), work.raw(), work_int.raw(), &residual, + &iters, seed); } } // namespace raft diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp index 35fc22c770..d14bf05f37 100644 --- a/cpp/include/raft/spectral/lapack.hpp +++ b/cpp/include/raft/spectral/lapack.hpp @@ -21,125 +21,66 @@ #include #include -// for now; TODO: check if/where this `define` should be; +//for now; TODO: check if/where this `define` should be; // #define USE_LAPACK namespace raft { -#define lapackCheckError(status) \ - { \ - if (status < 0) { \ - std::stringstream ss; \ - ss << "Lapack error: argument number " << -status << " had an illegal value."; \ - throw exception(ss.str()); \ - } else if (status > 0) \ - RAFT_FAIL("Lapack error: internal error."); \ +#define lapackCheckError(status) \ + { \ + if (status < 0) { \ + std::stringstream ss; \ + ss << "Lapack error: argument number " << -status \ + << " had an illegal value."; \ + throw exception(ss.str()); \ + } else if (status > 0) \ + RAFT_FAIL("Lapack error: internal error."); \ } -extern "C" void sgeqrf_( - int* m, int* n, float* a, int* lda, float* tau, float* work, int* lwork, int* info); -extern "C" void dgeqrf_( - int* m, int* n, double* a, int* lda, double* tau, double* work, int* lwork, int* info); -extern "C" void sormqr_(char* side, - char* trans, - int* m, - int* n, - int* k, - float* a, - int* lda, - const float* tau, - float* c, - int* ldc, - float* work, - int* lwork, - int* info); -extern "C" void dormqr_(char* side, - char* trans, - int* m, - int* n, - int* k, - double* a, - int* lda, - const double* tau, - double* c, - int* ldc, - double* work, - int* lwork, - int* info); -extern "C" int dgeev_(char* jobvl, - char* jobvr, - int* n, - double* a, - int* lda, - double* wr, - double* wi, - double* vl, - int* ldvl, - double* vr, - int* ldvr, - double* work, - int* lwork, - int* info); - -extern "C" int sgeev_(char* jobvl, - char* jobvr, - int* n, - float* a, - int* lda, - float* wr, - float* wi, - float* vl, - int* ldvl, - float* vr, - int* ldvr, - float* work, - int* lwork, - int* info); - -extern "C" cusolverStatus_t cusolverDnSgemmHost(cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float* alpha, - const float* A, - int lda, - const float* B, - int ldb, - const float* beta, - float* C, - int ldc); - -extern "C" cusolverStatus_t cusolverDnDgemmHost(cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const double* alpha, - const double* A, - int lda, - const double* B, - int ldb, - const double* beta, - double* C, - int ldc); - -extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float* d, float* e, int* info); - -extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double* d, double* e, int* info); - -extern "C" cusolverStatus_t cusolverDnSsteqrHost( - const signed char* compz, int n, float* d, float* e, float* z, int ldz, float* work, int* info); - -extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char* compz, - int n, - double* d, - double* e, - double* z, - int ldz, - double* work, - int* info); +extern "C" void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, + float *work, int *lwork, int *info); +extern "C" void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau, + double *work, int *lwork, int *info); +extern "C" void sormqr_(char *side, char *trans, int *m, int *n, int *k, + float *a, int *lda, const float *tau, float *c, + int *ldc, float *work, int *lwork, int *info); +extern "C" void dormqr_(char *side, char *trans, int *m, int *n, int *k, + double *a, int *lda, const double *tau, double *c, + int *ldc, double *work, int *lwork, int *info); +extern "C" int dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, + double *wr, double *wi, double *vl, int *ldvl, double *vr, + int *ldvr, double *work, int *lwork, int *info); + +extern "C" int sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, + float *wr, float *wi, float *vl, int *ldvl, float *vr, + int *ldvr, float *work, int *lwork, int *info); + +extern "C" cusolverStatus_t cusolverDnSgemmHost( + cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, + const float *alpha, const float *A, int lda, const float *B, int ldb, + const float *beta, float *C, int ldc); + +extern "C" cusolverStatus_t cusolverDnDgemmHost( + cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, + const double *alpha, const double *A, int lda, const double *B, int ldb, + const double *beta, double *C, int ldc); + +extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float *d, float *e, + int *info); + +extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double *d, double *e, + int *info); + +extern "C" cusolverStatus_t cusolverDnSsteqrHost(const signed char *compz, + int n, float *d, float *e, + float *z, int ldz, float *work, + int *info); + +extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char *compz, + int n, double *d, double *e, + double *z, int ldz, + double *work, int *info); template class Lapack { @@ -150,339 +91,182 @@ class Lapack { public: static void check_lapack_enabled(); - static void gemm(bool transa, - bool transb, - int m, - int n, - int k, - T alpha, - const T* A, - int lda, - const T* B, - int ldb, - T beta, - T* C, + static void gemm(bool transa, bool transb, int m, int n, int k, T alpha, + const T *A, int lda, const T *B, int ldb, T beta, T *C, int ldc); // special QR for lanczos - static void sterf(int n, T* d, T* e); - static void steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work); + static void sterf(int n, T *d, T *e); + static void steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work); // QR // computes the QR factorization of a general matrix - static void geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork); + static void geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork); // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf. // multiply C by implicit Q - static void ormqr(bool right_side, - bool transq, - int m, - int n, - int k, - T* a, - int lda, - T* tau, - T* c, - int ldc, - T* work, - int* lwork); - - static void geev(T* A, T* eigenvalues, int dim, int lda); - static void geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr); - static void geev(T* A, - T* eigenvalues_r, - T* eigenvalues_i, - T* eigenvectors_r, - T* eigenvectors_i, - int dim, - int lda, + static void ormqr(bool right_side, bool transq, int m, int n, int k, T *a, + int lda, T *tau, T *c, int ldc, T *work, int *lwork); + + static void geev(T *A, T *eigenvalues, int dim, int lda); + static void geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, int ldvr); + static void geev(T *A, T *eigenvalues_r, T *eigenvalues_i, T *eigenvectors_r, + T *eigenvectors_i, int dim, int lda, int ldvr); private: - static void lapack_gemm(const char transa, - const char transb, - int m, - int n, - int k, - float alpha, - const float* a, - int lda, - const float* b, - int ldb, - float beta, - float* c, - int ldc) - { - cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cusolverDnSgemmHost( - cublas_transa, cublas_transb, m, n, k, &alpha, (float*)a, lda, (float*)b, ldb, &beta, c, ldc); + static void lapack_gemm(const char transa, const char transb, int m, int n, + int k, float alpha, const float *a, int lda, + const float *b, int ldb, float beta, float *c, + int ldc) { + cublasOperation_t cublas_transa = + (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = + (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnSgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha, + (float *)a, lda, (float *)b, ldb, &beta, c, ldc); } - static void lapack_gemm(const signed char transa, - const signed char transb, - int m, - int n, - int k, - double alpha, - const double* a, - int lda, - const double* b, - int ldb, - double beta, - double* c, - int ldc) - { - cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cusolverDnDgemmHost(cublas_transa, - cublas_transb, - m, - n, - k, - &alpha, - (double*)a, - lda, - (double*)b, - ldb, - &beta, - c, - ldc); + static void lapack_gemm(const signed char transa, const signed char transb, + int m, int n, int k, double alpha, const double *a, + int lda, const double *b, int ldb, double beta, + double *c, int ldc) { + cublasOperation_t cublas_transa = + (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = + (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnDgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha, + (double *)a, lda, (double *)b, ldb, &beta, c, ldc); } - static void lapack_sterf(int n, float* d, float* e, int* info) - { + static void lapack_sterf(int n, float *d, float *e, int *info) { cusolverDnSsterfHost(n, d, e, info); } - static void lapack_sterf(int n, double* d, double* e, int* info) - { + static void lapack_sterf(int n, double *d, double *e, int *info) { cusolverDnDsterfHost(n, d, e, info); } - static void lapack_steqr( - const signed char compz, int n, float* d, float* e, float* z, int ldz, float* work, int* info) - { + static void lapack_steqr(const signed char compz, int n, float *d, float *e, + float *z, int ldz, float *work, int *info) { cusolverDnSsteqrHost(&compz, n, d, e, z, ldz, work, info); } - static void lapack_steqr(const signed char compz, - int n, - double* d, - double* e, - double* z, - int ldz, - double* work, - int* info) - { + static void lapack_steqr(const signed char compz, int n, double *d, double *e, + double *z, int ldz, double *work, int *info) { cusolverDnDsteqrHost(&compz, n, d, e, z, ldz, work, info); } - static void lapack_geqrf( - int m, int n, float* a, int lda, float* tau, float* work, int* lwork, int* info) - { + static void lapack_geqrf(int m, int n, float *a, int lda, float *tau, + float *work, int *lwork, int *info) { sgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); } - static void lapack_geqrf( - int m, int n, double* a, int lda, double* tau, double* work, int* lwork, int* info) - { + static void lapack_geqrf(int m, int n, double *a, int lda, double *tau, + double *work, int *lwork, int *info) { dgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); } - static void lapack_ormqr(char side, - char trans, - int m, - int n, - int k, - float* a, - int lda, - float* tau, - float* c, - int ldc, - float* work, - int* lwork, - int* info) - { - sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); + static void lapack_ormqr(char side, char trans, int m, int n, int k, float *a, + int lda, float *tau, float *c, int ldc, float *work, + int *lwork, int *info) { + sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, + info); } - static void lapack_ormqr(char side, - char trans, - int m, - int n, - int k, - double* a, - int lda, - double* tau, - double* c, - int ldc, - double* work, - int* lwork, - int* info) - { - dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); + static void lapack_ormqr(char side, char trans, int m, int n, int k, + double *a, int lda, double *tau, double *c, int ldc, + double *work, int *lwork, int *info) { + dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, + info); } - static int lapack_geev_dispatch(char* jobvl, - char* jobvr, - int* n, - double* a, - int* lda, - double* wr, - double* wi, - double* vl, - int* ldvl, - double* vr, - int* ldvr, - double* work, - int* lwork, - int* info) - { - return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); + static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, double *a, + int *lda, double *wr, double *wi, double *vl, + int *ldvl, double *vr, int *ldvr, + double *work, int *lwork, int *info) { + return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, + lwork, info); } - static int lapack_geev_dispatch(char* jobvl, - char* jobvr, - int* n, - float* a, - int* lda, - float* wr, - float* wi, - float* vl, - int* ldvl, - float* vr, - int* ldvr, - float* work, - int* lwork, - int* info) - { - return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); + static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, float *a, + int *lda, float *wr, float *wi, float *vl, + int *ldvl, float *vr, int *ldvr, float *work, + int *lwork, int *info) { + return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, + lwork, info); } // real eigenvalues - static void lapack_geev(T* A, T* eigenvalues, int dim, int lda) - { + static void lapack_geev(T *A, T *eigenvalues, int dim, int lda) { char job = 'N'; std::vector WI(dim); - int ldv = 1; - T* vl = 0; + int ldv = 1; + T *vl = 0; int work_size = 6 * dim; std::vector work(work_size); int info; - lapack_geev_dispatch(&job, - &job, - &dim, - A, - &lda, - eigenvalues, - WI.data(), - vl, - &ldv, - vl, - &ldv, - work.data(), - &work_size, - &info); + lapack_geev_dispatch(&job, &job, &dim, A, &lda, eigenvalues, WI.data(), vl, + &ldv, vl, &ldv, work.data(), &work_size, &info); lapackCheckError(info); } // real eigenpairs - static void lapack_geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr) - { + static void lapack_geev(T *A, T *eigenvalues, T *eigenvectors, int dim, + int lda, int ldvr) { char jobvl = 'N'; char jobvr = 'V'; std::vector WI(dim); int work_size = 6 * dim; - T* vl = 0; - int ldvl = 1; + T *vl = 0; + int ldvl = 1; std::vector work(work_size); int info; - lapack_geev_dispatch(&jobvl, - &jobvr, - &dim, - A, - &lda, - eigenvalues, - WI.data(), - vl, - &ldvl, - eigenvectors, - &ldvr, - work.data(), - &work_size, - &info); + lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues, WI.data(), + vl, &ldvl, eigenvectors, &ldvr, work.data(), + &work_size, &info); lapackCheckError(info); } // complex eigenpairs - static void lapack_geev(T* A, - T* eigenvalues_r, - T* eigenvalues_i, - T* eigenvectors_r, - T* eigenvectors_i, - int dim, - int lda, - int ldvr) - { - char jobvl = 'N'; - char jobvr = 'V'; + static void lapack_geev(T *A, T *eigenvalues_r, T *eigenvalues_i, + T *eigenvectors_r, T *eigenvectors_i, int dim, + int lda, int ldvr) { + char jobvl = 'N'; + char jobvr = 'V'; int work_size = 8 * dim; - int ldvl = 1; + int ldvl = 1; std::vector work(work_size); int info; - lapack_geev_dispatch(&jobvl, - &jobvr, - &dim, - A, - &lda, - eigenvalues_r, - eigenvalues_i, - 0, - &ldvl, - eigenvectors_r, - &ldvr, - work.data(), - &work_size, - &info); + lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues_r, + eigenvalues_i, 0, &ldvl, eigenvectors_r, &ldvr, + work.data(), &work_size, &info); lapackCheckError(info); } }; template -void Lapack::check_lapack_enabled() -{ +void Lapack::check_lapack_enabled() { #ifndef USE_LAPACK RAFT_FAIL("Error: LAPACK not enabled."); #endif } template -void Lapack::gemm(bool transa, - bool transb, - int m, - int n, - int k, - T alpha, - const T* A, - int lda, - const T* B, - int ldb, - T beta, - T* C, - int ldc) -{ +void Lapack::gemm(bool transa, bool transb, int m, int n, int k, T alpha, + const T *A, int lda, const T *B, int ldb, T beta, T *C, + int ldc) { // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK const char transA_char = transa ? 'T' : 'N'; const char transB_char = transb ? 'T' : 'N'; - lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); //#endif } template -void Lapack::sterf(int n, T* d, T* e) -{ +void Lapack::sterf(int n, T *d, T *e) { // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK int info; @@ -492,8 +276,7 @@ void Lapack::sterf(int n, T* d, T* e) } template -void Lapack::steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work) -{ +void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) { // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK int info; @@ -503,8 +286,8 @@ void Lapack::steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work) } template -void Lapack::geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork) -{ +void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, + int *lwork) { check_lapack_enabled(); #ifdef USE_LAPACK int info; @@ -513,22 +296,11 @@ void Lapack::geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork) #endif } template -void Lapack::ormqr(bool right_side, - bool transq, - int m, - int n, - int k, - T* a, - int lda, - T* tau, - T* c, - int ldc, - T* work, - int* lwork) -{ +void Lapack::ormqr(bool right_side, bool transq, int m, int n, int k, T *a, + int lda, T *tau, T *c, int ldc, T *work, int *lwork) { check_lapack_enabled(); #ifdef USE_LAPACK - char side = right_side ? 'R' : 'L'; + char side = right_side ? 'R' : 'L'; char trans = transq ? 'T' : 'N'; int info; lapack_ormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info); @@ -538,8 +310,7 @@ void Lapack::ormqr(bool right_side, // real eigenvalues template -void Lapack::geev(T* A, T* eigenvalues, int dim, int lda) -{ +void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) { check_lapack_enabled(); #ifdef USE_LAPACK lapack_geev(A, eigenvalues, dim, lda); @@ -547,8 +318,8 @@ void Lapack::geev(T* A, T* eigenvalues, int dim, int lda) } // real eigenpairs template -void Lapack::geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr) -{ +void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, + int ldvr) { check_lapack_enabled(); #ifdef USE_LAPACK lapack_geev(A, eigenvalues, eigenvectors, dim, lda, ldvr); @@ -556,18 +327,13 @@ void Lapack::geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, in } // complex eigenpairs template -void Lapack::geev(T* A, - T* eigenvalues_r, - T* eigenvalues_i, - T* eigenvectors_r, - T* eigenvectors_i, - int dim, - int lda, - int ldvr) -{ +void Lapack::geev(T *A, T *eigenvalues_r, T *eigenvalues_i, + T *eigenvectors_r, T *eigenvectors_i, int dim, int lda, + int ldvr) { check_lapack_enabled(); #ifdef USE_LAPACK - lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, dim, lda, ldvr); + lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, + dim, lda, ldvr); #endif } diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 89d2b7e8ec..c43154d17a 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -40,12 +40,10 @@ using size_type = int; // for now; TODO: move it in appropriate header // Apply diagonal matrix to vector: // template -static __global__ void diagmv(IndexType_ n, - ValueType_ alpha, +static __global__ void diagmv(IndexType_ n, ValueType_ alpha, const ValueType_* __restrict__ D, const ValueType_* __restrict__ x, - ValueType_* __restrict__ y) -{ + ValueType_* __restrict__ y) { IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { y[i] += alpha * D[i] * x[i]; @@ -60,7 +58,7 @@ enum struct sparse_mv_alg_t : int { SPARSE_MV_UNDEFINED = -1, SPARSE_MV_ALG_DEFAULT, // generic, for any sparse matrix SPARSE_MV_ALG1, // typical for CSR - SPARSE_MV_ALG2 // may provide better performamce for irregular sparse matrices + SPARSE_MV_ALG2 // may provide better performamce for irregular sparse matrices }; // Vector "view"-like aggregate for linear algebra purposes @@ -70,21 +68,21 @@ struct vector_view_t { value_type* buffer_; size_type size_; - vector_view_t(value_type* buffer, size_type sz) : buffer_(buffer), size_(sz) {} + vector_view_t(value_type* buffer, size_type sz) + : buffer_(buffer), size_(sz) {} - vector_view_t(vector_view_t&& other) : buffer_(other.buffer_), size_(other.size_) - { + vector_view_t(vector_view_t&& other) + : buffer_(other.buffer_), size_(other.size_) { other.buffer_ = nullptr; - other.size_ = 0; + other.size_ = 0; } - vector_view_t& operator=(vector_view_t&& other) - { + vector_view_t& operator=(vector_view_t&& other) { buffer_ = other.buffer_; - size_ = other.size_; + size_ = other.size_; other.buffer_ = nullptr; - other.size_ = 0; + other.size_ = 0; } }; @@ -100,16 +98,15 @@ class vector_t { public: vector_t(handle_t const& raft_handle, size_type sz) : handle_(raft_handle), - buffer_(static_cast(raft_handle.get_device_allocator()->allocate( - sz * sizeof(value_type), raft_handle.get_stream()))), + buffer_( + static_cast(raft_handle.get_device_allocator()->allocate( + sz * sizeof(value_type), raft_handle.get_stream()))), size_(sz), - stream_(raft_handle.get_stream()) - { - } + stream_(raft_handle.get_stream()) {} - ~vector_t(void) - { - handle_.get_device_allocator()->deallocate(buffer_, size_ * sizeof(value_type), stream_); + ~vector_t(void) { + handle_.get_device_allocator()->deallocate( + buffer_, size_ * sizeof(value_type), stream_); } size_type size(void) const { return size_; } @@ -119,31 +116,26 @@ class vector_t { value_type const* raw(void) const { return buffer_; } template - value_type nrm1(ThrustExecPolicy t_exe_pol) const - { - return thrust::reduce( - t_exe_pol, buffer_, buffer_ + size_, value_type{0}, [] __device__(auto left, auto right) { - auto abs_left = left > 0 ? left : -left; - auto abs_right = right > 0 ? right : -right; - return abs_left + abs_right; - }); + value_type nrm1(ThrustExecPolicy t_exe_pol) const { + return thrust::reduce(t_exe_pol, buffer_, buffer_ + size_, value_type{0}, + [] __device__(auto left, auto right) { + auto abs_left = left > 0 ? left : -left; + auto abs_right = right > 0 ? right : -right; + return abs_left + abs_right; + }); } template - void fill(ThrustExecPolicy t_exe_pol, value_type value) - { + void fill(ThrustExecPolicy t_exe_pol, value_type value) { thrust::fill_n(t_exe_pol, buffer_, size_, value); } }; template struct sparse_matrix_t { - sparse_matrix_t(handle_t const& raft_handle, - index_type const* row_offsets, - index_type const* col_indices, - value_type const* values, - index_type const nrows, - index_type const ncols, + sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, + index_type const* col_indices, value_type const* values, + index_type const nrows, index_type const ncols, index_type const nnz) : handle_(raft_handle), row_offsets_(row_offsets), @@ -151,25 +143,18 @@ struct sparse_matrix_t { values_(values), nrows_(nrows), ncols_(ncols), - nnz_(nnz) - { - } + nnz_(nnz) {} - sparse_matrix_t(handle_t const& raft_handle, - index_type const* row_offsets, - index_type const* col_indices, - value_type const* values, - index_type const nrows, - index_type const nnz) + sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, + index_type const* col_indices, value_type const* values, + index_type const nrows, index_type const nnz) : handle_(raft_handle), row_offsets_(row_offsets), col_indices_(col_indices), values_(values), nrows_(nrows), ncols_(nrows), - nnz_(nnz) - { - } + nnz_(nnz) {} template sparse_matrix_t(handle_t const& raft_handle, CSRView const& csr_view) @@ -179,9 +164,7 @@ struct sparse_matrix_t { values_(csr_view.edge_data), nrows_(csr_view.number_of_vertices), ncols_(csr_view.number_of_vertices), - nnz_(csr_view.number_of_edges) - { - } + nnz_(csr_view.number_of_edges) {} virtual ~sparse_matrix_t(void) = default; // virtual because used as base for following matrix types @@ -191,24 +174,21 @@ struct sparse_matrix_t { // descriptor creation works with non-const, and const-casting // down is dangerous) // - virtual void mv(value_type alpha, - value_type* __restrict__ x, - value_type beta, + virtual void mv(value_type alpha, value_type* __restrict__ x, value_type beta, value_type* __restrict__ y, sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, - bool transpose = false, - bool symmetric = false) const - { + bool transpose = false, bool symmetric = false) const { using namespace sparse; RAFT_EXPECTS(x != nullptr, "Null x buffer."); RAFT_EXPECTS(y != nullptr, "Null y buffer."); auto cusparse_h = handle_.get_cusparse_handle(); - auto stream = handle_.get_stream(); + auto stream = handle_.get_stream(); - cusparseOperation_t trans = transpose ? CUSPARSE_OPERATION_TRANSPOSE : // transpose - CUSPARSE_OPERATION_NON_TRANSPOSE; // non-transpose + cusparseOperation_t trans = + transpose ? CUSPARSE_OPERATION_TRANSPOSE : // transpose + CUSPARSE_OPERATION_NON_TRANSPOSE; //non-transpose #if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP auto size_x = transpose ? nrows_ : ncols_; @@ -216,19 +196,15 @@ struct sparse_matrix_t { cusparseSpMVAlg_t spmv_alg = translate_algorithm(alg); - // create descriptors: + //create descriptors: //(below casts are necessary, because // cusparseCreateCsr(...) takes non-const // void*; the casts should be harmless) // cusparseSpMatDescr_t matA; - CUSPARSE_CHECK(cusparsecreatecsr(&matA, - nrows_, - ncols_, - nnz_, - const_cast(row_offsets_), - const_cast(col_indices_), - const_cast(values_))); + CUSPARSE_CHECK(cusparsecreatecsr( + &matA, nrows_, ncols_, nnz_, const_cast(row_offsets_), + const_cast(col_indices_), const_cast(values_))); cusparseDnVecDescr_t vecX; CUSPARSE_CHECK(cusparsecreatednvec(&vecX, size_x, x)); @@ -236,29 +212,31 @@ struct sparse_matrix_t { cusparseDnVecDescr_t vecY; CUSPARSE_CHECK(cusparsecreatednvec(&vecY, size_y, y)); - // get (scratch) external device buffer size: + //get (scratch) external device buffer size: // size_t bufferSize; - CUSPARSE_CHECK(cusparsespmv_buffersize( - cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, &bufferSize, stream)); + CUSPARSE_CHECK(cusparsespmv_buffersize(cusparse_h, trans, &alpha, matA, + vecX, &beta, vecY, spmv_alg, + &bufferSize, stream)); - // allocate external buffer: + //allocate external buffer: // vector_t external_buffer(handle_, bufferSize); - // finally perform SpMV: + //finally perform SpMV: // - CUSPARSE_CHECK(cusparsespmv( - cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, external_buffer.raw(), stream)); + CUSPARSE_CHECK(cusparsespmv(cusparse_h, trans, &alpha, matA, vecX, &beta, + vecY, spmv_alg, external_buffer.raw(), stream)); - // free descriptors: + //free descriptors: //(TODO: maybe wrap them in a RAII struct?) // CUSPARSE_CHECK(cusparseDestroyDnVec(vecY)); CUSPARSE_CHECK(cusparseDestroyDnVec(vecX)); CUSPARSE_CHECK(cusparseDestroySpMat(matA)); #else - CUSPARSE_CHECK(cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); + CUSPARSE_CHECK( + cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); cusparseMatDescr_t descr = 0; CUSPARSE_CHECK(cusparseCreateMatDescr(&descr)); if (symmetric) { @@ -267,20 +245,9 @@ struct sparse_matrix_t { CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); } CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); - CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, - trans, - nrows_, - ncols_, - nnz_, - &alpha, - descr, - values_, - row_offsets_, - col_indices_, - x, - &beta, - y, - stream)); + CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, trans, nrows_, ncols_, nnz_, + &alpha, descr, values_, row_offsets_, + col_indices_, x, &beta, y, stream)); CUSPARSE_CHECK(cusparseDestroyMatDescr(descr)); #endif } @@ -288,18 +255,19 @@ struct sparse_matrix_t { handle_t const& get_handle(void) const { return handle_; } #if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP - cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const - { + cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const { switch (alg) { - case sparse_mv_alg_t::SPARSE_MV_ALG1: return CUSPARSE_CSRMV_ALG1; - case sparse_mv_alg_t::SPARSE_MV_ALG2: return CUSPARSE_CSRMV_ALG2; - default: return CUSPARSE_MV_ALG_DEFAULT; + case sparse_mv_alg_t::SPARSE_MV_ALG1: + return CUSPARSE_CSRMV_ALG1; + case sparse_mv_alg_t::SPARSE_MV_ALG2: + return CUSPARSE_CSRMV_ALG2; + default: + return CUSPARSE_MV_ALG_DEFAULT; } } #endif - // private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, - // aggregate + //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate handle_t const& handle_; index_type const* row_offsets_; @@ -316,51 +284,44 @@ struct laplacian_matrix_t : sparse_matrix_t { laplacian_matrix_t(handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, index_type const* row_offsets, - index_type const* col_indices, - value_type const* values, - index_type const nrows, - index_type const nnz) - : sparse_matrix_t( - raft_handle, row_offsets, col_indices, values, nrows, nnz), - diagonal_(raft_handle, nrows) - { + index_type const* col_indices, value_type const* values, + index_type const nrows, index_type const nnz) + : sparse_matrix_t(raft_handle, row_offsets, + col_indices, values, nrows, nnz), + diagonal_(raft_handle, nrows) { vector_t ones{raft_handle, nrows}; ones.fill(thrust_exec_policy, 1.0); - sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); + sparse_matrix_t::mv(1, ones.raw(), 0, + diagonal_.raw()); } template laplacian_matrix_t(handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, sparse_matrix_t const& csr_m) - : sparse_matrix_t(raft_handle, - csr_m.row_offsets_, - csr_m.col_indices_, - csr_m.values_, - csr_m.nrows_, - csr_m.nnz_), - diagonal_(raft_handle, csr_m.nrows_) - { + : sparse_matrix_t(raft_handle, csr_m.row_offsets_, + csr_m.col_indices_, csr_m.values_, + csr_m.nrows_, csr_m.nnz_), + diagonal_(raft_handle, csr_m.nrows_) { vector_t ones{raft_handle, csr_m.nrows_}; ones.fill(thrust_exec_policy, 1.0); - sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); + sparse_matrix_t::mv(1, ones.raw(), 0, + diagonal_.raw()); } // y = alpha*A*x + beta*y // - void mv(value_type alpha, - value_type* __restrict__ x, - value_type beta, + void mv(value_type alpha, value_type* __restrict__ x, value_type beta, value_type* __restrict__ y, sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, - bool transpose = false, - bool symmetric = false) const override - { + bool transpose = false, bool symmetric = false) const override { constexpr int BLOCK_SIZE = 1024; - auto n = sparse_matrix_t::nrows_; + auto n = sparse_matrix_t::nrows_; - auto cublas_h = sparse_matrix_t::get_handle().get_cublas_handle(); - auto stream = sparse_matrix_t::get_handle().get_stream(); + auto cublas_h = + sparse_matrix_t::get_handle().get_cublas_handle(); + auto stream = + sparse_matrix_t::get_handle().get_stream(); // scales y by beta: // @@ -372,7 +333,8 @@ struct laplacian_matrix_t : sparse_matrix_t { // Apply diagonal matrix // - dim3 gridDim{std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; + dim3 gridDim{ + std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; dim3 blockDim{BLOCK_SIZE, 1, 1}; diagmv<<>>(n, alpha, diagonal_.raw(), x, y); @@ -380,7 +342,8 @@ struct laplacian_matrix_t : sparse_matrix_t { // Apply adjacency matrix // - sparse_matrix_t::mv(-alpha, x, 1, y, alg, transpose, symmetric); + sparse_matrix_t::mv(-alpha, x, 1, y, alg, transpose, + symmetric); } vector_t diagonal_; @@ -392,68 +355,58 @@ struct modularity_matrix_t : laplacian_matrix_t { modularity_matrix_t(handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, index_type const* row_offsets, - index_type const* col_indices, - value_type const* values, - index_type const nrows, - index_type const nnz) + index_type const* col_indices, value_type const* values, + index_type const nrows, index_type const nnz) : laplacian_matrix_t( - raft_handle, thrust_exec_policy, row_offsets, col_indices, values, nrows, nnz) - { - edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(thrust_exec_policy); + raft_handle, thrust_exec_policy, row_offsets, col_indices, values, + nrows, nnz) { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( + thrust_exec_policy); } template modularity_matrix_t(handle_t const& raft_handle, ThrustExePolicy thrust_exec_policy, sparse_matrix_t const& csr_m) - : laplacian_matrix_t(raft_handle, thrust_exec_policy, csr_m) - { - edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(thrust_exec_policy); + : laplacian_matrix_t(raft_handle, + thrust_exec_policy, csr_m) { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( + thrust_exec_policy); } // y = alpha*A*x + beta*y // - void mv(value_type alpha, - value_type* __restrict__ x, - value_type beta, + void mv(value_type alpha, value_type* __restrict__ x, value_type beta, value_type* __restrict__ y, sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, - bool transpose = false, - bool symmetric = false) const override - { + bool transpose = false, bool symmetric = false) const override { auto n = sparse_matrix_t::nrows_; - auto cublas_h = sparse_matrix_t::get_handle().get_cublas_handle(); - auto stream = sparse_matrix_t::get_handle().get_stream(); + auto cublas_h = + sparse_matrix_t::get_handle().get_cublas_handle(); + auto stream = + sparse_matrix_t::get_handle().get_stream(); // y = A*x // - sparse_matrix_t::mv(alpha, x, 0, y, alg, transpose, symmetric); + sparse_matrix_t::mv(alpha, x, 0, y, alg, transpose, + symmetric); value_type dot_res; // gamma = d'*x // // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); - CUBLAS_CHECK(linalg::cublasdot(cublas_h, - n, - laplacian_matrix_t::diagonal_.raw(), - 1, - x, - 1, - &dot_res, - stream)); + CUBLAS_CHECK(linalg::cublasdot( + cublas_h, n, laplacian_matrix_t::diagonal_.raw(), + 1, x, 1, &dot_res, stream)); // y = y -(gamma/edge_sum)*d // value_type gamma_ = -dot_res / edge_sum_; - CUBLAS_CHECK(linalg::cublasaxpy(cublas_h, - n, - &gamma_, - laplacian_matrix_t::diagonal_.raw(), - 1, - y, - 1, - stream)); + CUBLAS_CHECK(linalg::cublasaxpy( + cublas_h, n, &gamma_, + laplacian_matrix_t::diagonal_.raw(), 1, y, 1, + stream)); } value_type edge_sum_; diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index bb7087a3be..f8dfe5daa3 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -40,8 +40,7 @@ #endif #ifdef COLLECT_TIME_STATISTICS -static double timer(void) -{ +static double timer(void) { struct timeval tv; cudaDeviceSynchronize(); gettimeofday(&tv, NULL); @@ -80,27 +79,19 @@ using namespace linalg; * performed. * @return error flag. */ -template +template std::tuple modularity_maximization( - handle_t const& handle, - ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const& csr_m, - EigenSolver const& eigen_solver, - ClusterSolver const& cluster_solver, - vertex_t* __restrict__ clusters, - weight_t* eigVals, - weight_t* eigVecs) -{ + handle_t const &handle, ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const &csr_m, + EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, + vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); std::tuple stats; // # iters eigen solver, cluster solver residual, # iters cluster solver @@ -113,10 +104,11 @@ std::tuple modularity_maximization( modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_config.n_eigVecs; + auto nEigVecs = eigen_config.n_eigVecs; // Compute eigenvectors corresponding to largest eigenvalues - std::get<0>(stats) = eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); + std::get<0>(stats) = + eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); // Whiten eigenvector matrix transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); @@ -127,8 +119,8 @@ std::tuple modularity_maximization( CHECK_CUDA(stream); // Find partition clustering - auto pair_cluster = - cluster_solver.solve(handle, thrust_exec_policy, n, nEigVecs, eigVecs, clusters); + auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, + nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -147,13 +139,12 @@ std::tuple modularity_maximization( * @param modularity On exit, modularity */ template -void analyzeModularity(handle_t const& handle, +void analyzeModularity(handle_t const &handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const& csr_m, + sparse_matrix_t const &csr_m, vertex_t nClusters, - vertex_t const* __restrict__ clusters, - weight_t& modularity) -{ + vertex_t const *__restrict__ clusters, + weight_t &modularity) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); vertex_t i; @@ -161,14 +152,15 @@ void analyzeModularity(handle_t const& handle, weight_t partModularity, clustersize; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Device memory vector_t part_i(handle, n); vector_t Bx(handle, n); // Initialize cuBLAS - CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; @@ -178,8 +170,8 @@ void analyzeModularity(handle_t const& handle, // Iterate through partitions for (i = 0; i < nClusters; ++i) { - if (!construct_indicator( - handle, thrust_exec_policy, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) { + if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, + partModularity, clusters, part_i, Bx, B)) { WARNING("empty partition"); continue; } diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index e2576c1d69..841fca04d9 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -62,30 +62,22 @@ using namespace linalg; * performed. * @return statistics: number of eigensolver iterations, . */ -template -std::tuple partition(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const& csr_m, - EigenSolver const& eigen_solver, - ClusterSolver const& cluster_solver, - vertex_t* __restrict__ clusters, - weight_t* eigVals, - weight_t* eigVecs) -{ +template +std::tuple partition( + handle_t const &handle, ThrustExePolicy thrust_exec_policy, + sparse_matrix_t const &csr_m, + EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, + vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); std::tuple - stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, - //cluster solver residual, # iters cluster solver + stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver vertex_t n = csr_m.nrows_; @@ -96,21 +88,22 @@ std::tuple partition(handle_t const& handle, // Compute eigenvectors of Laplacian // Initialize Laplacian - /// sparse_matrix_t A{handle, graph}; + ///sparse_matrix_t A{handle, graph}; laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_config.n_eigVecs; + auto nEigVecs = eigen_config.n_eigVecs; // Compute smallest eigenvalues and eigenvectors - std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); + std::get<0>(stats) = + eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); // Whiten eigenvector matrix transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); // Find partition clustering - auto pair_cluster = - cluster_solver.solve(handle, thrust_exec_policy, n, nEigVecs, eigVecs, clusters); + auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, + nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -137,21 +130,18 @@ std::tuple partition(handle_t const& handle, * @return error flag. */ template -void analyzePartition(handle_t const& handle, +void analyzePartition(handle_t const &handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const& csr_m, - vertex_t nClusters, - const vertex_t* __restrict__ clusters, - weight_t& edgeCut, - weight_t& cost) -{ + sparse_matrix_t const &csr_m, + vertex_t nClusters, const vertex_t *__restrict__ clusters, + weight_t &edgeCut, weight_t &cost) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); vertex_t i; vertex_t n = csr_m.nrows_; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); weight_t partEdgesCut, clustersize; @@ -160,21 +150,22 @@ void analyzePartition(handle_t const& handle, vector_t Lx(handle, n); // Initialize cuBLAS - CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Laplacian - /// sparse_matrix_t A{handle, graph}; + ///sparse_matrix_t A{handle, graph}; laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; // Initialize output - cost = 0; + cost = 0; edgeCut = 0; // Iterate through partitions for (i = 0; i < nClusters; ++i) { // Construct indicator vector for ith partition - if (!construct_indicator( - handle, thrust_exec_policy, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) { + if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, + partEdgesCut, clusters, part_i, Lx, L)) { WARNING("empty partition"); continue; } diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index 5349cb2810..40dde30a74 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -28,18 +28,20 @@ namespace raft { namespace spectral { template -static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, value_type_t* obs) -{ +static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, + value_type_t* obs) { index_type_t i, j, k, index, mm; value_type_t alpha, v, last; bool valid; // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension // compute alpha - mm = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x); // m in multiple of blockDim.x + mm = (((m + blockDim.x - 1) / blockDim.x) * + blockDim.x); // m in multiple of blockDim.x alpha = 0.0; - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; + j += blockDim.y * gridDim.y) { for (i = threadIdx.x; i < mm; i += blockDim.x) { // check if the thread is valid valid = i < m; @@ -64,17 +66,17 @@ static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, value_ty // scale by alpha alpha = __shfl_sync(warp_full_mask(), alpha, blockDim.x - 1, blockDim.x); alpha = std::sqrt(alpha); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; + j += blockDim.y * gridDim.y) { for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 - index = i + j * m; + index = i + j * m; obs[index] = obs[index] / alpha; } } } template -index_type_t next_pow2(index_type_t n) -{ +index_type_t next_pow2(index_type_t n) { index_type_t v; // Reference: // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float @@ -88,8 +90,7 @@ index_type_t next_pow2(index_type_t n) } template -cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) -{ +cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) { index_type_t p2m; // find next power of 2 @@ -101,20 +102,19 @@ cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) dim3 nblocks{1, (n + nthreads.y - 1) / nthreads.y, 1}; // launch scaling kernel (scale each column of obs by its norm) - scale_obs_kernel<<>>(m, n, obs); + scale_obs_kernel + <<>>(m, n, obs); return cudaSuccess; } -template +template void transform_eigen_matrix(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, - edge_t n, - vertex_t nEigVecs, - weight_t* eigVecs) -{ + ThrustExePolicy thrust_exec_policy, edge_t n, + vertex_t nEigVecs, weight_t* eigVecs) { auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); const weight_t zero{0.0}; const weight_t one{1.0}; @@ -123,9 +123,9 @@ void transform_eigen_matrix(handle_t const& handle, for (auto i = 0; i < nEigVecs; ++i) { weight_t mean, std; - mean = thrust::reduce(thrust_exec_policy, - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + mean = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); CHECK_CUDA(stream); mean /= n; thrust::transform(thrust_exec_policy, @@ -136,7 +136,8 @@ void transform_eigen_matrix(handle_t const& handle, thrust::minus()); CHECK_CUDA(stream); - CUBLAS_CHECK(cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + CUBLAS_CHECK( + cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); std /= std::sqrt(static_cast(n)); @@ -153,25 +154,16 @@ void transform_eigen_matrix(handle_t const& handle, // TODO: in-place transpose { vector_t work(handle, nEigVecs * n); - CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - CUBLAS_CHECK(cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - nEigVecs, - n, - &one, - eigVecs, - n, - &zero, - (weight_t*)NULL, - nEigVecs, - work.raw(), - nEigVecs, - stream)); - - CUDA_TRY(cudaMemcpyAsync( - eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream)); + CUBLAS_CHECK( + cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n, + &one, eigVecs, n, &zero, (weight_t*)NULL, nEigVecs, + work.raw(), nEigVecs, stream)); + + CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(), + nEigVecs * n * sizeof(weight_t), + cudaMemcpyDeviceToDevice, stream)); } } @@ -186,48 +178,49 @@ struct equal_to_i_op { public: equal_to_i_op(index_type_t _i) : i(_i) {} template - __host__ __device__ void operator()(Tuple_ t) - { - thrust::get<1>(t) = (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0; + __host__ __device__ void operator()(Tuple_ t) { + thrust::get<1>(t) = + (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0; } }; } // namespace // Construct indicator vector for ith partition // -template +template bool construct_indicator(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, - edge_t index, - edge_t n, - weight_t& clustersize, - weight_t& partStats, + ThrustExePolicy thrust_exec_policy, edge_t index, + edge_t n, weight_t& clustersize, weight_t& partStats, vertex_t const* __restrict__ clusters, - vector_t& part_i, - vector_t& Bx, - laplacian_matrix_t const& B) -{ + vector_t& part_i, vector_t& Bx, + laplacian_matrix_t const& B) { auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); - - thrust::for_each( - thrust_exec_policy, - thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters), - thrust::device_pointer_cast(part_i.raw()))), - thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters + n), - thrust::device_pointer_cast(part_i.raw() + n))), - equal_to_i_op(index)); + auto stream = handle.get_stream(); + + thrust::for_each(thrust_exec_policy, + thrust::make_zip_iterator(thrust::make_tuple( + thrust::device_pointer_cast(clusters), + thrust::device_pointer_cast(part_i.raw()))), + thrust::make_zip_iterator(thrust::make_tuple( + thrust::device_pointer_cast(clusters + n), + thrust::device_pointer_cast(part_i.raw() + n))), + equal_to_i_op(index)); CHECK_CUDA(stream); // Compute size of ith partition - CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream)); + CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, + &clustersize, stream)); clustersize = round(clustersize); - if (clustersize < 0.5) { return false; } + if (clustersize < 0.5) { + return false; + } // Compute part stats B.mv(1, part_i.raw(), 0, Bx.raw()); - CUBLAS_CHECK(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); + CUBLAS_CHECK( + cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); return true; } diff --git a/cpp/include/raft/spectral/warn_dbg.hpp b/cpp/include/raft/spectral/warn_dbg.hpp index 08a4e6efb5..406f1b7c7e 100644 --- a/cpp/include/raft/spectral/warn_dbg.hpp +++ b/cpp/include/raft/spectral/warn_dbg.hpp @@ -4,13 +4,13 @@ #include #define STRINGIFY_DETAIL(x) #x -#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) +#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) #ifdef DEBUG #define COUT() (std::cout) #define CERR() (std::cerr) -// nope: +//nope: // #define WARNING(message) \ do { \ diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh index 4d6724482c..8691cabc85 100644 --- a/cpp/include/raft/stats/mean.cuh +++ b/cpp/include/raft/stats/mean.cuh @@ -26,15 +26,15 @@ namespace stats { ///@todo: ColsPerBlk has been tested only for 32! template -__global__ void meanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ +__global__ void meanKernelRowMajor(Type *mu, const Type *data, IdxType D, + IdxType N) { const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; for (IdxType i = rowId; i < N; i += stride) thread_data += (colId < D) ? data[i * D + colId] : Type(0); __shared__ Type smu[ColsPerBlk]; @@ -46,8 +46,8 @@ __global__ void meanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxTyp } template -__global__ void meanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ +__global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D, + IdxType N) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); @@ -57,7 +57,9 @@ __global__ void meanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxTyp thread_data += data[idx]; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { mu[blockIdx.x] = acc / N; } + if (threadIdx.x == 0) { + mu[blockIdx.x] = acc / N; + } } /** @@ -78,22 +80,24 @@ __global__ void meanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxTyp * @param stream: cuda stream */ template -void mean( - Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) -{ +void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample, + bool rowMajor, cudaStream_t stream) { static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), + raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemsetAsync(mu, 0, sizeof(Type) * D, stream)); - meanKernelRowMajor<<>>(mu, data, D, N); + meanKernelRowMajor + <<>>(mu, data, D, N); CUDA_CHECK(cudaPeekAtLastError()); Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); raft::linalg::scalarMultiply(mu, mu, ratio, D, stream); } else { - meanKernelColMajor<<>>(mu, data, D, N); + meanKernelColMajor + <<>>(mu, data, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/stats/mean_center.cuh b/cpp/include/raft/stats/mean_center.cuh index c0ba24312b..04934d4388 100644 --- a/cpp/include/raft/stats/mean_center.cuh +++ b/cpp/include/raft/stats/mean_center.cuh @@ -38,25 +38,12 @@ namespace stats { * @param stream cuda stream where to launch work */ template -void meanCenter(Type* out, - const Type* data, - const Type* mu, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ +void meanCenter(Type *out, const Type *data, const Type *mu, IdxType D, + IdxType N, bool rowMajor, bool bcastAlongRows, + cudaStream_t stream) { raft::linalg::matrixVectorOp( - out, - data, - mu, - D, - N, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { return a - b; }, - stream); + out, data, mu, D, N, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { return a - b; }, stream); } /** @@ -74,25 +61,11 @@ void meanCenter(Type* out, * @param stream cuda stream where to launch work */ template -void meanAdd(Type* out, - const Type* data, - const Type* mu, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ +void meanAdd(Type *out, const Type *data, const Type *mu, IdxType D, IdxType N, + bool rowMajor, bool bcastAlongRows, cudaStream_t stream) { raft::linalg::matrixVectorOp( - out, - data, - mu, - D, - N, - rowMajor, - bcastAlongRows, - [] __device__(Type a, Type b) { return a + b; }, - stream); + out, data, mu, D, N, rowMajor, bcastAlongRows, + [] __device__(Type a, Type b) { return a + b; }, stream); } }; // end namespace stats diff --git a/cpp/include/raft/stats/stddev.cuh b/cpp/include/raft/stats/stddev.cuh index 1dd9cd56bc..f12c633829 100644 --- a/cpp/include/raft/stats/stddev.cuh +++ b/cpp/include/raft/stats/stddev.cuh @@ -26,15 +26,15 @@ namespace stats { ///@todo: ColPerBlk has been tested only for 32! template -__global__ void stddevKernelRowMajor(Type* std, const Type* data, IdxType D, IdxType N) -{ +__global__ void stddevKernelRowMajor(Type *std, const Type *data, IdxType D, + IdxType N) { const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; for (IdxType i = rowId; i < N; i += stride) { Type val = (colId < D) ? data[i * D + colId] : Type(0); thread_data += val * val; @@ -48,39 +48,41 @@ __global__ void stddevKernelRowMajor(Type* std, const Type* data, IdxType D, Idx } template -__global__ void stddevKernelColMajor( - Type* std, const Type* data, const Type* mu, IdxType D, IdxType N) -{ +__global__ void stddevKernelColMajor(Type *std, const Type *data, + const Type *mu, IdxType D, IdxType N) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; + Type m = mu[blockIdx.x]; for (IdxType i = threadIdx.x; i < N; i += TPB) { IdxType idx = colStart + i; - Type diff = data[idx] - m; + Type diff = data[idx] - m; thread_data += diff * diff; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { std[blockIdx.x] = raft::mySqrt(acc / N); } + if (threadIdx.x == 0) { + std[blockIdx.x] = raft::mySqrt(acc / N); + } } template -__global__ void varsKernelColMajor( - Type* var, const Type* data, const Type* mu, IdxType D, IdxType N) -{ +__global__ void varsKernelColMajor(Type *var, const Type *data, const Type *mu, + IdxType D, IdxType N) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; + Type m = mu[blockIdx.x]; for (IdxType i = threadIdx.x; i < N; i += TPB) { IdxType idx = colStart + i; - Type diff = data[idx] - m; + Type diff = data[idx] - m; thread_data += diff * diff; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { var[blockIdx.x] = acc / N; } + if (threadIdx.x == 0) { + var[blockIdx.x] = acc / N; + } } /** @@ -102,33 +104,28 @@ __global__ void varsKernelColMajor( * @param stream cuda stream where to launch work */ template -void stddev(Type* std, - const Type* data, - const Type* mu, - IdxType D, - IdxType N, - bool sample, - bool rowMajor, - cudaStream_t stream) -{ +void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N, + bool sample, bool rowMajor, cudaStream_t stream) { static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), + raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemset(std, 0, sizeof(Type) * D)); - stddevKernelRowMajor<<>>(std, data, D, N); + stddevKernelRowMajor + <<>>(std, data, D, N); Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); raft::linalg::binaryOp( - std, - std, - mu, - D, - [ratio] __device__(Type a, Type b) { return raft::mySqrt(a * ratio - b * b); }, + std, std, mu, D, + [ratio] __device__(Type a, Type b) { + return raft::mySqrt(a * ratio - b * b); + }, stream); } else { - stddevKernelColMajor<<>>(std, data, mu, D, N); + stddevKernelColMajor + <<>>(std, data, mu, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } @@ -152,28 +149,25 @@ void stddev(Type* std, * @param stream cuda stream where to launch work */ template -void vars(Type* var, - const Type* data, - const Type* mu, - IdxType D, - IdxType N, - bool sample, - bool rowMajor, - cudaStream_t stream) -{ +void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N, + bool sample, bool rowMajor, cudaStream_t stream) { static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), + raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemset(var, 0, sizeof(Type) * D)); - stddevKernelRowMajor<<>>(var, data, D, N); + stddevKernelRowMajor + <<>>(var, data, D, N); Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); raft::linalg::binaryOp( - var, var, mu, D, [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream); + var, var, mu, D, + [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream); } else { - varsKernelColMajor<<>>(var, data, mu, D, N); + varsKernelColMajor + <<>>(var, data, mu, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/stats/sum.cuh b/cpp/include/raft/stats/sum.cuh index c7b8ce12b6..5f8416c7e2 100644 --- a/cpp/include/raft/stats/sum.cuh +++ b/cpp/include/raft/stats/sum.cuh @@ -26,15 +26,15 @@ namespace stats { ///@todo: ColsPerBlk has been tested only for 32! template -__global__ void sumKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ +__global__ void sumKernelRowMajor(Type *mu, const Type *data, IdxType D, + IdxType N) { const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; for (IdxType i = rowId; i < N; i += stride) thread_data += (colId < D) ? data[i * D + colId] : Type(0); __shared__ Type smu[ColsPerBlk]; @@ -46,8 +46,8 @@ __global__ void sumKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType } template -__global__ void sumKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ +__global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D, + IdxType N) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); @@ -57,7 +57,9 @@ __global__ void sumKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType thread_data += data[idx]; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { mu[blockIdx.x] = acc; } + if (threadIdx.x == 0) { + mu[blockIdx.x] = acc; + } } /** @@ -75,19 +77,21 @@ __global__ void sumKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType * @param stream cuda stream where to launch work */ template -void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream) -{ +void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor, + cudaStream_t stream) { static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), + raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemset(output, 0, sizeof(Type) * D)); sumKernelRowMajor <<>>(output, input, D, N); } else { - sumKernelColMajor<<>>(output, input, D, N); + sumKernelColMajor + <<>>(output, input, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/vectorized.cuh b/cpp/include/raft/vectorized.cuh index 1e0885fb99..1829fc0351 100644 --- a/cpp/include/raft/vectorized.cuh +++ b/cpp/include/raft/vectorized.cuh @@ -22,11 +22,11 @@ namespace raft { template -struct IOType { -}; +struct IOType {}; template <> struct IOType { - static_assert(sizeof(bool) == sizeof(int8_t), "IOType bool size assumption failed"); + static_assert(sizeof(bool) == sizeof(int8_t), + "IOType bool size assumption failed"); typedef int8_t Type; }; template <> @@ -215,42 +215,42 @@ struct IOType { }; /** - * @struct TxN_t - * - * @brief Internal data structure that is used to define a facade for vectorized - * loads/stores across the most common POD types. The goal of his file is to - * provide with CUDA programmers, an easy way to have compiler issue vectorized - * load or store instructions to memory (either global or shared). Vectorized - * accesses to memory are important as they'll utilize its resources - * efficiently, - * when compared to their non-vectorized counterparts. Obviously, for whatever - * reasons if one is unable to issue such vectorized operations, one can always - * fallback to using POD types. - * - * Example demonstrating the use of load operations, performing math on such - * loaded data and finally storing it back. - * @code{.cu} - * TxN_t mydata1, mydata2; - * int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * mydata1.Ratio; - * mydata1.load(ptr1, idx); - * mydata2.load(ptr2, idx); - * #pragma unroll - * for(int i=0;i type. - * Only change required is to replace variable declaration appropriately. - * - * Obviously, it's caller's responsibility to take care of pointer alignment! - * - * @tparam math_ the data-type in which the compute/math needs to happen - * @tparam veclen_ the number of 'math_' types to be loaded/stored per - * instruction - */ + * @struct TxN_t + * + * @brief Internal data structure that is used to define a facade for vectorized + * loads/stores across the most common POD types. The goal of his file is to + * provide with CUDA programmers, an easy way to have compiler issue vectorized + * load or store instructions to memory (either global or shared). Vectorized + * accesses to memory are important as they'll utilize its resources + * efficiently, + * when compared to their non-vectorized counterparts. Obviously, for whatever + * reasons if one is unable to issue such vectorized operations, one can always + * fallback to using POD types. + * + * Example demonstrating the use of load operations, performing math on such + * loaded data and finally storing it back. + * @code{.cu} + * TxN_t mydata1, mydata2; + * int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * mydata1.Ratio; + * mydata1.load(ptr1, idx); + * mydata2.load(ptr2, idx); + * #pragma unroll + * for(int i=0;i type. + * Only change required is to replace variable declaration appropriately. + * + * Obviously, it's caller's responsibility to take care of pointer alignment! + * + * @tparam math_ the data-type in which the compute/math needs to happen + * @tparam veclen_ the number of 'math_' types to be loaded/stored per + * instruction + */ template struct TxN_t { /** underlying math data type */ @@ -274,8 +274,7 @@ struct TxN_t { * @brief Fill the contents of this structure with a constant value * @param _val the constant to be filled */ - DI void fill(math_t _val) - { + DI void fill(math_t _val) { #pragma unroll for (int i = 0; i < Ratio; ++i) { val.data[i] = _val; @@ -300,24 +299,21 @@ struct TxN_t { * @{ */ template - DI void load(const math_t* ptr, idx_t idx) - { - const io_t* bptr = reinterpret_cast(&ptr[idx]); - val.internal = __ldg(bptr); + DI void load(const math_t *ptr, idx_t idx) { + const io_t *bptr = reinterpret_cast(&ptr[idx]); + val.internal = __ldg(bptr); } template - DI void load(math_t* ptr, idx_t idx) - { - io_t* bptr = reinterpret_cast(&ptr[idx]); + DI void load(math_t *ptr, idx_t idx) { + io_t *bptr = reinterpret_cast(&ptr[idx]); val.internal = *bptr; } template - DI void store(math_t* ptr, idx_t idx) - { - io_t* bptr = reinterpret_cast(&ptr[idx]); - *bptr = val.internal; + DI void store(math_t *ptr, idx_t idx) { + io_t *bptr = reinterpret_cast(&ptr[idx]); + *bptr = val.internal; } /** @} */ }; @@ -334,17 +330,11 @@ struct TxN_t { DI void fill(math_t _val) {} template - DI void load(const math_t* ptr, idx_t idx) - { - } + DI void load(const math_t *ptr, idx_t idx) {} template - DI void load(math_t* ptr, idx_t idx) - { - } + DI void load(math_t *ptr, idx_t idx) {} template - DI void store(math_t* ptr, idx_t idx) - { - } + DI void store(math_t *ptr, idx_t idx) {} }; } // namespace raft diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu index 284a873dec..4ff6cdf5fa 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster_solvers.cu @@ -23,8 +23,7 @@ namespace raft { -TEST(Raft, ClusterSolvers) -{ +TEST(Raft, ClusterSolvers) { using namespace matrix; using index_type = int; using value_type = double; @@ -41,7 +40,7 @@ TEST(Raft, ClusterSolvers) index_type d{10}; index_type k{5}; - // nullptr expected to trigger exceptions: + //nullptr expected to trigger exceptions: // value_type* eigvecs{nullptr}; index_type* codes{nullptr}; @@ -50,11 +49,11 @@ TEST(Raft, ClusterSolvers) kmeans_solver_t cluster_solver{cfg}; - EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, eigvecs, codes)); + EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, + eigvecs, codes)); } -TEST(Raft, ModularitySolvers) -{ +TEST(Raft, ModularitySolvers) { using namespace matrix; using index_type = int; using value_type = double; @@ -69,7 +68,7 @@ TEST(Raft, ModularitySolvers) value_type tol{1.0e-10}; bool reorthog{true}; - // nullptr expected to trigger exceptions: + //nullptr expected to trigger exceptions: // index_type* clusters{nullptr}; value_type* eigvals{nullptr}; @@ -83,18 +82,21 @@ TEST(Raft, ModularitySolvers) index_type k{5}; - cluster_solver_config_t clust_cfg{k, maxiter, tol, seed}; + cluster_solver_config_t clust_cfg{k, maxiter, tol, + seed}; kmeans_solver_t cluster_solver{clust_cfg}; auto stream = h.get_stream(); - sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; + sparse_matrix_t sm{h, nullptr, nullptr, + nullptr, 0, 0}; auto t_exe_p = thrust::cuda::par.on(stream); EXPECT_ANY_THROW(spectral::modularity_maximization( h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); value_type modularity{0}; - EXPECT_ANY_THROW(spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity)); + EXPECT_ANY_THROW( + spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity)); } } // namespace raft diff --git a/cpp/test/cudart_utils.cpp b/cpp/test/cudart_utils.cpp index 150767992f..c14d880efd 100644 --- a/cpp/test/cudart_utils.cpp +++ b/cpp/test/cudart_utils.cpp @@ -20,8 +20,7 @@ namespace raft { -TEST(Raft, Utils) -{ +TEST(Raft, Utils) { ASSERT_NO_THROW(ASSERT(1 == 1, "Should not assert!")); ASSERT_THROW(ASSERT(1 != 1, "Should assert!"), exception); ASSERT_THROW(THROW("Should throw!"), exception); diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu index 9ed32b80ef..e2ed2c01dc 100644 --- a/cpp/test/distance/dist_adj.cu +++ b/cpp/test/distance/dist_adj.cu @@ -25,42 +25,30 @@ namespace raft { namespace distance { template -__global__ void naiveDistanceAdjKernel(bool* dist, - const DataType* x, - const DataType* y, - int m, - int n, - int k, - DataType eps, - bool isRowMajor) -{ +__global__ void naiveDistanceAdjKernel(bool *dist, const DataType *x, + const DataType *y, int m, int n, int k, + DataType eps, bool isRowMajor) { int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; auto diff = x[xidx] - y[yidx]; acc += diff * diff; } - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc <= eps; } template -void naiveDistanceAdj(bool* dist, - const DataType* x, - const DataType* y, - int m, - int n, - int k, - DataType eps, - bool isRowMajor) -{ +void naiveDistanceAdj(bool *dist, const DataType *x, const DataType *y, int m, + int n, int k, DataType eps, bool isRowMajor) { static const dim3 TPB(16, 32, 1); dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1); - naiveDistanceAdjKernel<<>>(dist, x, y, m, n, k, eps, isRowMajor); + naiveDistanceAdjKernel + <<>>(dist, x, y, m, n, k, eps, isRowMajor); CUDA_CHECK(cudaPeekAtLastError()); } @@ -73,21 +61,21 @@ struct DistanceAdjInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const DistanceAdjInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, + const DistanceAdjInputs &dims) { return os; } template -class DistanceAdjTest : public ::testing::TestWithParam> { +class DistanceAdjTest + : public ::testing::TestWithParam> { public: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - int m = params.m; - int n = params.n; - int k = params.k; + int m = params.m; + int n = params.n; + int k = params.k; bool isRowMajor = params.isRowMajor; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -101,23 +89,25 @@ class DistanceAdjTest : public ::testing::TestWithParam( - x, y, m, n, k); - if (worksize != 0) { raft::allocate(workspace, worksize); } + char *workspace = nullptr; + size_t worksize = + raft::distance::getWorkspaceSize(x, y, m, n, k); + if (worksize != 0) { + raft::allocate(workspace, worksize); + } auto fin_op = [threshold] __device__(DataType d_val, int g_d_idx) { return d_val <= threshold; }; - raft::distance::distance( + raft::distance::distance( x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(workspace)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(x)); CUDA_CHECK(cudaFree(y)); CUDA_CHECK(cudaFree(dist_ref)); @@ -141,13 +131,13 @@ const std::vector> inputsf = { {10.0f, 1024, 1024, 32, false, 1234ULL}, }; typedef DistanceAdjTest DistanceAdjTestF; -TEST_P(DistanceAdjTestF, Result) -{ +TEST_P(DistanceAdjTestF, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::Compare())); } -INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestF, + ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.01, 1024, 1024, 32, true, 1234ULL}, @@ -160,13 +150,13 @@ const std::vector> inputsd = { {10.0, 1024, 1024, 32, false, 1234ULL}, }; typedef DistanceAdjTest DistanceAdjTestD; -TEST_P(DistanceAdjTestD, Result) -{ +TEST_P(DistanceAdjTestD, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::Compare())); } -INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestD, + ::testing::ValuesIn(inputsd)); } // namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_canberra.cu b/cpp/test/distance/dist_canberra.cu index c812a1985d..10bc4d1899 100644 --- a/cpp/test/distance/dist_canberra.cu +++ b/cpp/test/distance/dist_canberra.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceCanberra : public DistanceTest { -}; +class DistanceCanberra + : public DistanceTest {}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceCanberra DistanceCanberraF; -TEST_P(DistanceCanberraF, Result) -{ +TEST_P(DistanceCanberraF, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE( - raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraF, + ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceCanberra DistanceCanberraD; -TEST_P(DistanceCanberraD, Result) -{ +TEST_P(DistanceCanberraD, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE( - raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraD, + ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_chebyshev.cu b/cpp/test/distance/dist_chebyshev.cu index 0a4a69f059..6a2b02863a 100644 --- a/cpp/test/distance/dist_chebyshev.cu +++ b/cpp/test/distance/dist_chebyshev.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceLinf : public DistanceTest { -}; +class DistanceLinf + : public DistanceTest {}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceLinf DistanceLinfF; -TEST_P(DistanceLinfF, Result) -{ +TEST_P(DistanceLinfF, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE( - raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfF, + ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceLinf DistanceLinfD; -TEST_P(DistanceLinfD, Result) -{ +TEST_P(DistanceLinfD, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE( - raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfD, + ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_cos.cu b/cpp/test/distance/dist_cos.cu index f7510c17b1..291c4196f9 100644 --- a/cpp/test/distance/dist_cos.cu +++ b/cpp/test/distance/dist_cos.cu @@ -21,8 +21,9 @@ namespace raft { namespace distance { template -class DistanceExpCos : public DistanceTest { -}; +class DistanceExpCos + : public DistanceTest {}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,13 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceExpCos DistanceExpCosF; -TEST_P(DistanceExpCosF, Result) -{ +TEST_P(DistanceExpCosF, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosF, + ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -54,13 +56,14 @@ const std::vector> inputsd = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceExpCos DistanceExpCosD; -TEST_P(DistanceExpCosD, Result) -{ +TEST_P(DistanceExpCosD, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosD, + ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_euc_exp.cu b/cpp/test/distance/dist_euc_exp.cu index e90d0e83dc..46e7ded0ec 100644 --- a/cpp/test/distance/dist_euc_exp.cu +++ b/cpp/test/distance/dist_euc_exp.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceEucExpTest : public DistanceTest { -}; +class DistanceEucExpTest + : public DistanceTest {}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,13 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucExpTest DistanceEucExpTestF; -TEST_P(DistanceEucExpTestF, Result) -{ +TEST_P(DistanceEucExpTestF, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestF, + ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -54,13 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucExpTest DistanceEucExpTestD; -TEST_P(DistanceEucExpTestD, Result) -{ +TEST_P(DistanceEucExpTestD, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestD, + ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_euc_unexp.cu b/cpp/test/distance/dist_euc_unexp.cu index 90412a9cb2..92f424647d 100644 --- a/cpp/test/distance/dist_euc_unexp.cu +++ b/cpp/test/distance/dist_euc_unexp.cu @@ -36,13 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucUnexpTest DistanceEucUnexpTestF; -TEST_P(DistanceEucUnexpTestF, Result) -{ +TEST_P(DistanceEucUnexpTestF, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestF, + ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,13 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucUnexpTest DistanceEucUnexpTestD; -TEST_P(DistanceEucUnexpTestD, Result) -{ +TEST_P(DistanceEucUnexpTestD, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestD, + ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_hellinger.cu b/cpp/test/distance/dist_hellinger.cu index 95b1908dc1..39dc7aaeff 100644 --- a/cpp/test/distance/dist_hellinger.cu +++ b/cpp/test/distance/dist_hellinger.cu @@ -22,8 +22,8 @@ namespace distance { template class DistanceHellingerExp - : public DistanceTest { -}; + : public DistanceTest {}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -36,14 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceHellingerExp DistanceHellingerExpF; -TEST_P(DistanceHellingerExpF, Result) -{ +TEST_P(DistanceHellingerExpF, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE( - raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpF, + ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceHellingerExp DistanceHellingerExpD; -TEST_P(DistanceHellingerExpD, Result) -{ +TEST_P(DistanceHellingerExpD, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE( - raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpD, + ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_l1.cu b/cpp/test/distance/dist_l1.cu index d14f8d8a0b..bd32837e45 100644 --- a/cpp/test/distance/dist_l1.cu +++ b/cpp/test/distance/dist_l1.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceUnexpL1 : public DistanceTest { -}; +class DistanceUnexpL1 + : public DistanceTest {}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceUnexpL1 DistanceUnexpL1F; -TEST_P(DistanceUnexpL1F, Result) -{ +TEST_P(DistanceUnexpL1F, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE( - raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1F, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1F, + ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceUnexpL1 DistanceUnexpL1D; -TEST_P(DistanceUnexpL1D, Result) -{ +TEST_P(DistanceUnexpL1D, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE( - raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1D, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1D, + ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_minkowski.cu b/cpp/test/distance/dist_minkowski.cu index cc6a5f60de..42b8e294ac 100644 --- a/cpp/test/distance/dist_minkowski.cu +++ b/cpp/test/distance/dist_minkowski.cu @@ -21,7 +21,8 @@ namespace raft { namespace distance { template -class DistanceLpUnexp : public DistanceTest { +class DistanceLpUnexp + : public DistanceTest { }; const std::vector> inputsf = { @@ -35,14 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL, 3.0f}, }; typedef DistanceLpUnexp DistanceLpUnexpF; -TEST_P(DistanceLpUnexpF, Result) -{ +TEST_P(DistanceLpUnexpF, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE( - raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpF, + ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL, 4.0}, @@ -55,14 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL, 3.0}, }; typedef DistanceLpUnexp DistanceLpUnexpD; -TEST_P(DistanceLpUnexpD, Result) -{ +TEST_P(DistanceLpUnexpD, Result) { int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE( - raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpD, + ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index a99d307abb..fc7b064205 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -25,52 +25,43 @@ namespace raft { namespace distance { template -__global__ void naiveDistanceKernel(DataType* dist, - const DataType* x, - const DataType* y, - int m, - int n, - int k, +__global__ void naiveDistanceKernel(DataType *dist, const DataType *x, + const DataType *y, int m, int n, int k, raft::distance::DistanceType type, - bool isRowMajor) -{ + bool isRowMajor) { int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; auto diff = x[xidx] - y[yidx]; acc += diff * diff; } if (type == raft::distance::DistanceType::L2SqrtExpanded || type == raft::distance::DistanceType::L2SqrtUnexpanded) acc = raft::mySqrt(acc); - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -__global__ void naiveL1_Linf_CanberraDistanceKernel(DataType* dist, - const DataType* x, - const DataType* y, - int m, - int n, - int k, - raft::distance::DistanceType type, - bool isRowMajor) -{ +__global__ void naiveL1_Linf_CanberraDistanceKernel( + DataType *dist, const DataType *x, const DataType *y, int m, int n, int k, + raft::distance::DistanceType type, bool isRowMajor) { int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; - if (midx >= m || nidx >= n) { return; } + if (midx >= m || nidx >= n) { + return; + } DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; auto diff = (a > b) ? (a - b) : (b - a); if (type == raft::distance::DistanceType::Linf) { acc = raft::myMax(acc, diff); @@ -84,27 +75,29 @@ __global__ void naiveL1_Linf_CanberraDistanceKernel(DataType* dist, } } - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -__global__ void naiveCosineDistanceKernel( - DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) -{ +__global__ void naiveCosineDistanceKernel(DataType *dist, const DataType *x, + const DataType *y, int m, int n, + int k, bool isRowMajor) { int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; - if (midx >= m || nidx >= n) { return; } + if (midx >= m || nidx >= n) { + return; + } - DataType acc_a = DataType(0); - DataType acc_b = DataType(0); + DataType acc_a = DataType(0); + DataType acc_b = DataType(0); DataType acc_ab = DataType(0); for (int i = 0; i < k; ++i) { int xidx = isRowMajor ? i + midx * k : i * m + midx; int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + auto a = x[xidx]; + auto b = y[yidx]; acc_a += a * a; acc_b += b * b; acc_ab += a * b; @@ -113,74 +106,64 @@ __global__ void naiveCosineDistanceKernel( int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; // Use 1.0 - (cosine similarity) to calc the distance - dist[outidx] = (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b)); + dist[outidx] = + (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b)); } template -__global__ void naiveHellingerDistanceKernel( - DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) -{ +__global__ void naiveHellingerDistanceKernel(DataType *dist, const DataType *x, + const DataType *y, int m, int n, + int k, bool isRowMajor) { int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; - if (midx >= m || nidx >= n) { return; } + if (midx >= m || nidx >= n) { + return; + } DataType acc_ab = DataType(0); for (int i = 0; i < k; ++i) { int xidx = isRowMajor ? i + midx * k : i * m + midx; int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + auto a = x[xidx]; + auto b = y[yidx]; acc_ab += raft::mySqrt(a) * raft::mySqrt(b); } int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative - acc_ab = 1 - acc_ab; + acc_ab = 1 - acc_ab; auto rectifier = (!signbit(acc_ab)); - dist[outidx] = raft::mySqrt(rectifier * acc_ab); + dist[outidx] = raft::mySqrt(rectifier * acc_ab); } template -__global__ void naiveLpUnexpDistanceKernel(DataType* dist, - const DataType* x, - const DataType* y, - int m, - int n, - int k, - bool isRowMajor, - DataType p) -{ +__global__ void naiveLpUnexpDistanceKernel(DataType *dist, const DataType *x, + const DataType *y, int m, int n, + int k, bool isRowMajor, DataType p) { int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; auto diff = raft::L1Op()(a - b); acc += raft::myPow(diff, p); } auto one_over_p = 1 / p; - acc = raft::myPow(acc, one_over_p); - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; - dist[outidx] = acc; + acc = raft::myPow(acc, one_over_p); + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + dist[outidx] = acc; } template -void naiveDistance(DataType* dist, - const DataType* x, - const DataType* y, - int m, - int n, - int k, - raft::distance::DistanceType type, - bool isRowMajor, - DataType metric_arg = 2.0f) -{ +void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m, + int n, int k, raft::distance::DistanceType type, + bool isRowMajor, DataType metric_arg = 2.0f) { static const dim3 TPB(16, 32, 1); dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1); @@ -195,19 +178,23 @@ void naiveDistance(DataType* dist, case raft::distance::DistanceType::L2Unexpanded: case raft::distance::DistanceType::L2SqrtExpanded: case raft::distance::DistanceType::L2Expanded: - naiveDistanceKernel<<>>(dist, x, y, m, n, k, type, isRowMajor); + naiveDistanceKernel + <<>>(dist, x, y, m, n, k, type, isRowMajor); break; case raft::distance::DistanceType::CosineExpanded: - naiveCosineDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); + naiveCosineDistanceKernel + <<>>(dist, x, y, m, n, k, isRowMajor); break; case raft::distance::DistanceType::HellingerExpanded: - naiveHellingerDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); + naiveHellingerDistanceKernel + <<>>(dist, x, y, m, n, k, isRowMajor); break; case raft::distance::DistanceType::LpUnexpanded: naiveLpUnexpDistanceKernel <<>>(dist, x, y, m, n, k, isRowMajor, metric_arg); break; - default: FAIL() << "should be here\n"; + default: + FAIL() << "should be here\n"; } CUDA_CHECK(cudaPeekAtLastError()); } @@ -222,47 +209,37 @@ struct DistanceInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const DistanceInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, + const DistanceInputs &dims) { return os; } template -void distanceLauncher(DataType* x, - DataType* y, - DataType* dist, - DataType* dist2, - int m, - int n, - int k, - DistanceInputs& params, - DataType threshold, - char* workspace, - size_t worksize, - cudaStream_t stream, - bool isRowMajor, - DataType metric_arg = 2.0f) -{ +void distanceLauncher(DataType *x, DataType *y, DataType *dist, DataType *dist2, + int m, int n, int k, DistanceInputs ¶ms, + DataType threshold, char *workspace, size_t worksize, + cudaStream_t stream, bool isRowMajor, + DataType metric_arg = 2.0f) { auto fin_op = [dist2, threshold] __device__(DataType d_val, int g_d_idx) { dist2[g_d_idx] = (d_val < threshold) ? 0.f : d_val; return d_val; }; raft::distance::distance( - x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg); + x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, + metric_arg); } template class DistanceTest : public ::testing::TestWithParam> { public: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - int m = params.m; - int n = params.n; - int k = params.k; + int m = params.m; + int n = params.n; + int k = params.k; DataType metric_arg = params.metric_arg; - bool isRowMajor = params.isRowMajor; + bool isRowMajor = params.isRowMajor; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); raft::allocate(x, m * k); @@ -279,33 +256,25 @@ class DistanceTest : public ::testing::TestWithParam> { r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream); } - naiveDistance(dist_ref, x, y, m, n, k, distanceType, isRowMajor, metric_arg); - char* workspace = nullptr; + naiveDistance(dist_ref, x, y, m, n, k, distanceType, isRowMajor, + metric_arg); + char *workspace = nullptr; size_t worksize = - raft::distance::getWorkspaceSize(x, y, m, n, k); - if (worksize != 0) { raft::allocate(workspace, worksize); } + raft::distance::getWorkspaceSize(x, y, m, n, k); + if (worksize != 0) { + raft::allocate(workspace, worksize); + } DataType threshold = -10000.f; - distanceLauncher(x, - y, - dist, - dist2, - m, - n, - k, - params, - threshold, - workspace, - worksize, - stream, - isRowMajor, - metric_arg); + distanceLauncher(x, y, dist, dist2, m, n, k, params, + threshold, workspace, worksize, + stream, isRowMajor, metric_arg); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(workspace)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(x)); CUDA_CHECK(cudaFree(y)); CUDA_CHECK(cudaFree(dist_ref)); diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index a7b763a2bc..4573a070b6 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -29,40 +29,40 @@ template struct CubKVPMinReduce { typedef cub::KeyValuePair KVP; - DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } + DI KVP operator()(LabelT rit, const KVP &a, const KVP &b) { + return b.value < a.value ? b : a; + } - DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } + DI KVP operator()(const KVP &a, const KVP &b) { + return b.value < a.value ? b : a; + } }; // KVPMinReduce template -__global__ void naiveKernel(cub::KeyValuePair* min, - DataT* x, - DataT* y, - int m, - int n, - int k, - int* workspace, - DataT maxVal) -{ - int midx = threadIdx.y + blockIdx.y * blockDim.y; - int nidx = threadIdx.x + blockIdx.x * blockDim.x; +__global__ void naiveKernel(cub::KeyValuePair *min, DataT *x, + DataT *y, int m, int n, int k, int *workspace, + DataT maxVal) { + int midx = threadIdx.y + blockIdx.y * blockDim.y; + int nidx = threadIdx.x + blockIdx.x * blockDim.x; DataT acc = DataT(0); for (int i = 0; i < k; ++i) { - int xidx = i + midx * k; - int yidx = i + nidx * k; + int xidx = i + midx * k; + int yidx = i + nidx * k; auto diff = midx >= m || nidx >= n ? DataT(0) : x[xidx] - y[yidx]; acc += diff * diff; } - if (Sqrt) { acc = raft::mySqrt(acc); } + if (Sqrt) { + acc = raft::mySqrt(acc); + } ReduceOpT redOp; typedef cub::WarpReduce> WarpReduce; __shared__ typename WarpReduce::TempStorage temp[NWARPS]; int warpId = threadIdx.x / raft::WarpSize; cub::KeyValuePair tmp; - tmp.key = nidx; + tmp.key = nidx; tmp.value = midx >= m || nidx >= n ? maxVal : acc; - tmp = WarpReduce(temp[warpId]).Reduce(tmp, CubKVPMinReduce()); + tmp = WarpReduce(temp[warpId]).Reduce(tmp, CubKVPMinReduce()); if (threadIdx.x % raft::WarpSize == 0 && midx < m) { while (atomicCAS(workspace + midx, 0, 1) == 1) ; @@ -74,15 +74,8 @@ __global__ void naiveKernel(cub::KeyValuePair* min, } template -void naive(cub::KeyValuePair* min, - DataT* x, - DataT* y, - int m, - int n, - int k, - int* workspace, - cudaStream_t stream) -{ +void naive(cub::KeyValuePair *min, DataT *x, DataT *y, int m, int n, + int k, int *workspace, cudaStream_t stream) { static const dim3 TPB(32, 16, 1); dim3 nblks(raft::ceildiv(n, (int)TPB.x), raft::ceildiv(m, (int)TPB.y), 1); CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream)); @@ -92,7 +85,8 @@ void naive(cub::KeyValuePair* min, <<>>(min, m, std::numeric_limits::max(), op); CUDA_CHECK(cudaGetLastError()); naiveKernel, 16> - <<>>(min, x, y, m, n, k, workspace, std::numeric_limits::max()); + <<>>(min, x, y, m, n, k, workspace, + std::numeric_limits::max()); CUDA_CHECK(cudaGetLastError()); } @@ -106,8 +100,7 @@ struct Inputs { template class FusedL2NNTest : public ::testing::TestWithParam> { public: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int m = params.m; @@ -128,8 +121,7 @@ class FusedL2NNTest : public ::testing::TestWithParam> { raft::linalg::rowNorm(yn, y, k, n, raft::linalg::L2Norm, true, stream); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(x)); @@ -144,38 +136,25 @@ class FusedL2NNTest : public ::testing::TestWithParam> { protected: Inputs params; DataT *x, *y, *xn, *yn; - char* workspace; - cub::KeyValuePair*min, *min_ref; + char *workspace; + cub::KeyValuePair *min, *min_ref; cudaStream_t stream; - virtual void generateGoldenResult() - { + virtual void generateGoldenResult() { int m = params.m; int n = params.n; int k = params.k; - naive(min_ref, x, y, m, n, k, (int*)workspace, stream); + naive(min_ref, x, y, m, n, k, (int *)workspace, stream); } - void runTest(cub::KeyValuePair* out) - { + void runTest(cub::KeyValuePair *out) { int m = params.m; int n = params.n; int k = params.k; MinAndDistanceReduceOp redOp; - fusedL2NN, int>(out, - x, - y, - xn, - yn, - m, - n, - k, - (void*)workspace, - redOp, - raft::distance::KVPMinReduce(), - Sqrt, - true, - stream); + fusedL2NN, int>( + out, x, y, xn, yn, m, n, k, (void *)workspace, redOp, + raft::distance::KVPMinReduce(), Sqrt, true, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } }; @@ -184,10 +163,9 @@ template struct CompareApproxAbsKVP { typedef typename cub::KeyValuePair KVP; CompareApproxAbsKVP(T eps_) : eps(eps_) {} - bool operator()(const KVP& a, const KVP& b) const - { - T diff = raft::abs(raft::abs(a.value) - raft::abs(b.value)); - T m = std::max(raft::abs(a.value), raft::abs(b.value)); + bool operator()(const KVP &a, const KVP &b) const { + T diff = raft::abs(raft::abs(a.value) - raft::abs(b.value)); + T m = std::max(raft::abs(a.value), raft::abs(b.value)); T ratio = m >= eps ? diff / m : diff; return (ratio <= eps); } @@ -199,20 +177,17 @@ struct CompareApproxAbsKVP { template struct CompareExactKVP { typedef typename cub::KeyValuePair KVP; - bool operator()(const KVP& a, const KVP& b) const - { + bool operator()(const KVP &a, const KVP &b) const { if (a.value != b.value) return false; return true; } }; template -::testing::AssertionResult devArrMatch(const cub::KeyValuePair* expected, - const cub::KeyValuePair* actual, - size_t size, - L eq_compare, - cudaStream_t stream = 0) -{ +::testing::AssertionResult devArrMatch(const cub::KeyValuePair *expected, + const cub::KeyValuePair *actual, + size_t size, L eq_compare, + cudaStream_t stream = 0) { typedef typename cub::KeyValuePair KVP; std::shared_ptr exp_h(new KVP[size]); std::shared_ptr act_h(new KVP[size]); @@ -224,42 +199,47 @@ template auto act = act_h.get()[i]; if (!eq_compare(exp, act)) { return ::testing::AssertionFailure() - << "actual=" << act.key << "," << act.value << " != expected=" << exp.key << "," - << exp.value << " @" << i; + << "actual=" << act.key << "," << act.value + << " != expected=" << exp.key << "," << exp.value << " @" << i; } } return ::testing::AssertionSuccess(); } const std::vector> inputsf = { - {0.001f, 32, 32, 32, 1234ULL}, {0.001f, 32, 64, 32, 1234ULL}, {0.001f, 64, 32, 32, 1234ULL}, - {0.001f, 64, 64, 32, 1234ULL}, {0.001f, 128, 32, 32, 1234ULL}, {0.001f, 128, 64, 32, 1234ULL}, + {0.001f, 32, 32, 32, 1234ULL}, {0.001f, 32, 64, 32, 1234ULL}, + {0.001f, 64, 32, 32, 1234ULL}, {0.001f, 64, 64, 32, 1234ULL}, + {0.001f, 128, 32, 32, 1234ULL}, {0.001f, 128, 64, 32, 1234ULL}, {0.001f, 128, 128, 64, 1234ULL}, {0.001f, 64, 128, 128, 1234ULL}, - {0.001f, 32, 32, 34, 1234ULL}, {0.001f, 32, 64, 34, 1234ULL}, {0.001f, 64, 32, 34, 1234ULL}, - {0.001f, 64, 64, 34, 1234ULL}, {0.001f, 128, 32, 34, 1234ULL}, {0.001f, 128, 64, 34, 1234ULL}, + {0.001f, 32, 32, 34, 1234ULL}, {0.001f, 32, 64, 34, 1234ULL}, + {0.001f, 64, 32, 34, 1234ULL}, {0.001f, 64, 64, 34, 1234ULL}, + {0.001f, 128, 32, 34, 1234ULL}, {0.001f, 128, 64, 34, 1234ULL}, {0.001f, 128, 128, 66, 1234ULL}, {0.001f, 64, 128, 130, 1234ULL}, - {0.001f, 32, 32, 33, 1234ULL}, {0.001f, 32, 64, 33, 1234ULL}, {0.001f, 64, 32, 33, 1234ULL}, - {0.001f, 64, 64, 33, 1234ULL}, {0.001f, 128, 32, 33, 1234ULL}, {0.001f, 128, 64, 33, 1234ULL}, + {0.001f, 32, 32, 33, 1234ULL}, {0.001f, 32, 64, 33, 1234ULL}, + {0.001f, 64, 32, 33, 1234ULL}, {0.001f, 64, 64, 33, 1234ULL}, + {0.001f, 128, 32, 33, 1234ULL}, {0.001f, 128, 64, 33, 1234ULL}, {0.001f, 128, 128, 65, 1234ULL}, {0.001f, 64, 128, 129, 1234ULL}, {0.006f, 1805, 134, 2, 1234ULL}, }; typedef FusedL2NNTest FusedL2NNTestF_Sq; -TEST_P(FusedL2NNTestF_Sq, Result) -{ +TEST_P(FusedL2NNTestF_Sq, Result) { runTest(min); - ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch(min_ref, min, params.m, + CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sq, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sq, + ::testing::ValuesIn(inputsf)); typedef FusedL2NNTest FusedL2NNTestF_Sqrt; -TEST_P(FusedL2NNTestF_Sqrt, Result) -{ +TEST_P(FusedL2NNTestF_Sqrt, Result) { runTest(min); - ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch(min_ref, min, params.m, + CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sqrt, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sqrt, + ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.00001, 32, 32, 32, 1234ULL}, {0.00001, 32, 64, 32, 1234ULL}, @@ -280,38 +260,38 @@ const std::vector> inputsd = { {0.00001, 1805, 134, 2, 1234ULL}, }; typedef FusedL2NNTest FusedL2NNTestD_Sq; -TEST_P(FusedL2NNTestD_Sq, Result) -{ +TEST_P(FusedL2NNTestD_Sq, Result) { runTest(min); - ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch(min_ref, min, params.m, + CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sq, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sq, + ::testing::ValuesIn(inputsd)); typedef FusedL2NNTest FusedL2NNTestD_Sqrt; -TEST_P(FusedL2NNTestD_Sqrt, Result) -{ +TEST_P(FusedL2NNTestD_Sqrt, Result) { runTest(min); - ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch(min_ref, min, params.m, + CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sqrt, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sqrt, + ::testing::ValuesIn(inputsd)); /// This is to test output determinism of the prim template class FusedL2NNDetTest : public FusedL2NNTest { - void SetUp() override - { + void SetUp() override { FusedL2NNTest::SetUp(); int m = this->params.m; raft::allocate(min1, m); } - void TearDown() override - { + void TearDown() override { FusedL2NNTest::TearDown(); CUDA_CHECK(cudaFree(min1)); } protected: - cub::KeyValuePair* min1; + cub::KeyValuePair *min1; static const int NumRepeats = 100; @@ -319,46 +299,46 @@ class FusedL2NNDetTest : public FusedL2NNTest { }; typedef FusedL2NNDetTest FusedL2NNDetTestF_Sq; -TEST_P(FusedL2NNDetTestF_Sq, Result) -{ +TEST_P(FusedL2NNDetTestF_Sq, Result) { runTest(min); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1); ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sq, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sq, + ::testing::ValuesIn(inputsf)); typedef FusedL2NNDetTest FusedL2NNDetTestF_Sqrt; -TEST_P(FusedL2NNDetTestF_Sqrt, Result) -{ +TEST_P(FusedL2NNDetTestF_Sqrt, Result) { runTest(min); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1); ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sqrt, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sqrt, + ::testing::ValuesIn(inputsf)); typedef FusedL2NNDetTest FusedL2NNDetTestD_Sq; -TEST_P(FusedL2NNDetTestD_Sq, Result) -{ +TEST_P(FusedL2NNDetTestD_Sq, Result) { runTest(min); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1); ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sq, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sq, + ::testing::ValuesIn(inputsd)); typedef FusedL2NNDetTest FusedL2NNDetTestD_Sqrt; -TEST_P(FusedL2NNDetTestD_Sqrt, Result) -{ +TEST_P(FusedL2NNDetTestD_Sqrt, Result) { runTest(min); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1); ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sqrt, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sqrt, + ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index e14841eb54..e6ee09262e 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -23,8 +23,7 @@ namespace raft { -TEST(Raft, EigenSolvers) -{ +TEST(Raft, EigenSolvers) { using namespace matrix; using index_type = int; using value_type = double; @@ -36,10 +35,10 @@ TEST(Raft, EigenSolvers) index_type* ro{nullptr}; index_type* ci{nullptr}; value_type* vs{nullptr}; - index_type nnz = 0; + index_type nnz = 0; index_type nrows = 0; - auto stream = h.get_stream(); - auto t_exe_pol = thrust::cuda::par.on(stream); + auto stream = h.get_stream(); + auto t_exe_pol = thrust::cuda::par.on(stream); sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; ASSERT_EQ(nullptr, sm1.row_offsets_); @@ -50,7 +49,7 @@ TEST(Raft, EigenSolvers) value_type tol{1.0e-10}; bool reorthog{true}; - // nullptr expected to trigger exceptions: + //nullptr expected to trigger exceptions: // value_type* eigvals{nullptr}; value_type* eigvecs{nullptr}; @@ -61,13 +60,14 @@ TEST(Raft, EigenSolvers) lanczos_solver_t eig_solver{cfg}; - EXPECT_ANY_THROW(eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs)); + EXPECT_ANY_THROW( + eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs)); - EXPECT_ANY_THROW(eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs)); + EXPECT_ANY_THROW( + eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs)); } -TEST(Raft, SpectralSolvers) -{ +TEST(Raft, SpectralSolvers) { using namespace matrix; using index_type = int; using value_type = double; @@ -82,7 +82,7 @@ TEST(Raft, SpectralSolvers) value_type tol{1.0e-10}; bool reorthog{true}; - // nullptr expected to trigger exceptions: + //nullptr expected to trigger exceptions: // index_type* clusters{nullptr}; value_type* eigvals{nullptr}; @@ -96,19 +96,22 @@ TEST(Raft, SpectralSolvers) index_type k{5}; - cluster_solver_config_t clust_cfg{k, maxiter, tol, seed}; + cluster_solver_config_t clust_cfg{k, maxiter, tol, + seed}; kmeans_solver_t cluster_solver{clust_cfg}; auto stream = h.get_stream(); auto t_exe_p = thrust::cuda::par.on(stream); - sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; - EXPECT_ANY_THROW( - spectral::partition(h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + sparse_matrix_t sm{h, nullptr, nullptr, + nullptr, 0, 0}; + EXPECT_ANY_THROW(spectral::partition( + h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); value_type edgeCut{0}; value_type cost{0}; - EXPECT_ANY_THROW(spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost)); + EXPECT_ANY_THROW( + spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost)); } } // namespace raft diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp index 8023fca319..4cb9809844 100644 --- a/cpp/test/handle.cpp +++ b/cpp/test/handle.cpp @@ -22,8 +22,7 @@ namespace raft { -TEST(Raft, HandleDefault) -{ +TEST(Raft, HandleDefault) { handle_t h; ASSERT_EQ(0, h.get_num_internal_streams()); ASSERT_EQ(0, h.get_device()); @@ -34,8 +33,7 @@ TEST(Raft, HandleDefault) ASSERT_NE(nullptr, h.get_cusparse_handle()); } -TEST(Raft, Handle) -{ +TEST(Raft, Handle) { handle_t h(4); ASSERT_EQ(4, h.get_num_internal_streams()); cudaStream_t stream; @@ -46,15 +44,13 @@ TEST(Raft, Handle) CUDA_CHECK(cudaStreamDestroy(stream)); } -TEST(Raft, GetInternalStreams) -{ +TEST(Raft, GetInternalStreams) { handle_t h(4); auto streams = h.get_internal_streams(); ASSERT_EQ(4U, streams.size()); } -TEST(Raft, GetHandleFromPool) -{ +TEST(Raft, GetHandleFromPool) { handle_t parent(4); handle_t child(parent, 2); @@ -68,8 +64,7 @@ TEST(Raft, GetHandleFromPool) ASSERT_EQ(parent.get_device(), child.get_device()); } -TEST(Raft, GetHandleFromPoolPerf) -{ +TEST(Raft, GetHandleFromPoolPerf) { handle_t parent(100); auto start = curTimeMillis(); for (int i = 0; i < parent.get_num_internal_streams(); i++) { @@ -81,13 +76,13 @@ TEST(Raft, GetHandleFromPoolPerf) ASSERT_LE(curTimeMillis() - start, 10); } -TEST(Raft, GetHandleStreamViews) -{ +TEST(Raft, GetHandleStreamViews) { handle_t parent(4); handle_t child(parent, 2); ASSERT_EQ(parent.get_internal_stream_view(2), child.get_stream_view()); - ASSERT_EQ(parent.get_internal_stream_view(2).value(), child.get_stream_view().value()); + ASSERT_EQ(parent.get_internal_stream_view(2).value(), + child.get_stream_view().value()); EXPECT_FALSE(child.get_stream_view().is_default()); } } // namespace raft diff --git a/cpp/test/integer_utils.cpp b/cpp/test/integer_utils.cpp index d883de59fe..830d085a40 100644 --- a/cpp/test/integer_utils.cpp +++ b/cpp/test/integer_utils.cpp @@ -20,8 +20,7 @@ namespace raft { -TEST(Raft, rounding_up) -{ +TEST(Raft, rounding_up) { ASSERT_EQ(raft::div_rounding_up_safe(5, 3), 2); ASSERT_EQ(raft::div_rounding_up_safe(0, 3), 0); ASSERT_EQ(raft::div_rounding_up_safe(7, 8), 1); @@ -30,8 +29,7 @@ TEST(Raft, rounding_up) ASSERT_EQ(raft::div_rounding_up_unsafe(7, 8), 1); } -TEST(Raft, is_a_power_of_two) -{ +TEST(Raft, is_a_power_of_two) { ASSERT_EQ(raft::is_a_power_of_two(1 << 5), true); ASSERT_EQ(raft::is_a_power_of_two((1 << 5) + 1), false); } diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu index 209bb0355a..dc2846fdba 100644 --- a/cpp/test/label/label.cu +++ b/cpp/test/label/label.cu @@ -36,8 +36,7 @@ class labelTest : public ::testing::Test { }; typedef labelTest MakeMonotonicTest; -TEST_F(MakeMonotonicTest, Result) -{ +TEST_F(MakeMonotonicTest, Result) { cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -49,14 +48,17 @@ TEST_F(MakeMonotonicTest, Result) raft::allocate(actual, m, true); raft::allocate(expected, m, true); - float* data_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0}; + float *data_h = + new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0}; - float* expected_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 6.0, 7.0}; + float *expected_h = + new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 6.0, 7.0}; raft::update_device(data, data_h, m, stream); raft::update_device(expected, expected_h, m, stream); - std::shared_ptr allocator(new raft::mr::device::default_allocator); + std::shared_ptr allocator( + new raft::mr::device::default_allocator); make_monotonic(actual, data, m, stream, allocator); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -71,36 +73,37 @@ TEST_F(MakeMonotonicTest, Result) delete expected_h; } -TEST(labelTest, Classlabels) -{ +TEST(labelTest, Classlabels) { cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - std::shared_ptr allocator(new raft::mr::device::default_allocator); + std::shared_ptr allocator( + new raft::mr::device::default_allocator); int n_rows = 6; - float* y_d; + float *y_d; raft::allocate(y_d, n_rows); float y_h[] = {2, -1, 1, 2, 1, 1}; raft::update_device(y_d, y_h, n_rows, stream); int n_classes; - float* y_unique_d; + float *y_unique_d; getUniquelabels(y_d, n_rows, &y_unique_d, &n_classes, stream, allocator); ASSERT_EQ(n_classes, 3); float y_unique_exp[] = {-1, 1, 2}; - EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d, n_classes, raft::Compare(), stream)); + EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d, n_classes, + raft::Compare(), stream)); - float* y_relabeled_d; + float *y_relabeled_d; raft::allocate(y_relabeled_d, n_rows); getOvrlabels(y_d, n_rows, y_unique_d, n_classes, y_relabeled_d, 2, stream); float y_relabeled_exp[] = {1, -1, -1, 1, -1, -1}; - EXPECT_TRUE( - devArrMatchHost(y_relabeled_exp, y_relabeled_d, n_rows, raft::Compare(), stream)); + EXPECT_TRUE(devArrMatchHost(y_relabeled_exp, y_relabeled_d, n_rows, + raft::Compare(), stream)); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(y_d)); diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu index 3d930ff22e..a2f14a8dbc 100644 --- a/cpp/test/label/merge_labels.cu +++ b/cpp/test/label/merge_labels.cu @@ -39,7 +39,8 @@ struct MergeLabelsInputs { }; template -class MergeLabelsTest : public ::testing::TestWithParam> { +class MergeLabelsTest + : public ::testing::TestWithParam> { protected: MergeLabelsTest() : params(::testing::TestWithParam>::GetParam()), @@ -49,23 +50,25 @@ class MergeLabelsTest : public ::testing::TestWithParam(params.mask.data()), params.N, stream); - - merge_labels( - labels_a.data(), labels_b.data(), mask.data(), R.data(), m.data(), params.N, stream); + m(1, stream) {} + + void Run() { + raft::update_device(labels_a.data(), params.labels_a.data(), params.N, + stream); + raft::update_device(labels_b.data(), params.labels_b.data(), params.N, + stream); + raft::update_device(expected.data(), params.expected.data(), params.N, + stream); + raft::update_device(mask.data(), + reinterpret_cast(params.mask.data()), params.N, + stream); + + merge_labels(labels_a.data(), labels_b.data(), mask.data(), R.data(), + m.data(), params.N, stream); cudaStreamSynchronize(stream); - ASSERT_TRUE(raft::devArrMatch( - expected.data(), labels_a.data(), params.N, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(expected.data(), labels_a.data(), + params.N, raft::Compare())); } protected: @@ -82,14 +85,22 @@ TEST_P(MergeLabelsTestI, Result) { Run(); } using MergeLabelsTestL = MergeLabelsTest; TEST_P(MergeLabelsTestL, Result) { Run(); } -constexpr int MAX32 = std::numeric_limits::max(); +constexpr int MAX32 = std::numeric_limits::max(); constexpr int64_t MAX64 = std::numeric_limits::max(); const std::vector> merge_inputs_32 = { {4, {1, 1, 3, MAX32}, {1, 3, 3, 1}, {1, 0, 1, 0}, {1, 1, 3, 1}}, {5, {1, 2, 2, 2, 1}, {4, 2, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, - {6, {1, 2, 1, 4, 5, MAX32}, {1, 2, MAX32, 4, 5, 4}, {1, 1, 0, 1, 1, 0}, {1, 2, 1, 4, 5, 4}}, - {6, {1, 2, 2, 2, 2, 6}, {1, 1, 1, 5, 5, 5}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}}, + {6, + {1, 2, 1, 4, 5, MAX32}, + {1, 2, MAX32, 4, 5, 4}, + {1, 1, 0, 1, 1, 0}, + {1, 2, 1, 4, 5, 4}}, + {6, + {1, 2, 2, 2, 2, 6}, + {1, 1, 1, 5, 5, 5}, + {1, 1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1, 1}}, {8, {1, 1, 3, 3, MAX32, 1, 3, MAX32}, {1, 2, 3, 2, MAX32, 2, 2, 2}, @@ -105,8 +116,16 @@ const std::vector> merge_inputs_32 = { const std::vector> merge_inputs_64 = { {4, {1, 1, 3, MAX64}, {1, 3, 3, 1}, {1, 0, 1, 0}, {1, 1, 3, 1}}, {5, {1, 2, 2, 2, 1}, {4, 2, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, - {6, {1, 2, 1, 4, 5, MAX64}, {1, 2, MAX64, 4, 5, 4}, {1, 1, 0, 1, 1, 0}, {1, 2, 1, 4, 5, 4}}, - {6, {1, 2, 2, 2, 2, 6}, {1, 1, 1, 5, 5, 5}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}}, + {6, + {1, 2, 1, 4, 5, MAX64}, + {1, 2, MAX64, 4, 5, 4}, + {1, 1, 0, 1, 1, 0}, + {1, 2, 1, 4, 5, 4}}, + {6, + {1, 2, 2, 2, 2, 6}, + {1, 1, 1, 5, 5, 5}, + {1, 1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1, 1}}, {8, {1, 1, 3, 3, MAX64, 1, 3, MAX64}, {1, 2, 3, 2, MAX64, 2, 2, 2}, @@ -119,8 +138,10 @@ const std::vector> merge_inputs_64 = { {1, 1, 1, 1, 1, 7, 7, 7}}, }; -INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestI, ::testing::ValuesIn(merge_inputs_32)); -INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestL, ::testing::ValuesIn(merge_inputs_64)); +INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestI, + ::testing::ValuesIn(merge_inputs_32)); +INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestL, + ::testing::ValuesIn(merge_inputs_64)); } // namespace label } // namespace raft diff --git a/cpp/test/lap/lap.cu b/cpp/test/lap/lap.cu index 61c7182c72..04f473f836 100644 --- a/cpp/test/lap/lap.cu +++ b/cpp/test/lap/lap.cu @@ -29,11 +29,11 @@ #include #include -#define PROBLEMSIZE 1000 // Number of rows/columns -#define BATCHSIZE 10 // Number of problems in the batch -#define COSTRANGE 1000 +#define PROBLEMSIZE 1000 // Number of rows/columns +#define BATCHSIZE 10 // Number of problems in the batch +#define COSTRANGE 1000 #define PROBLEMCOUNT 1 -#define REPETITIONS 1 +#define REPETITIONS 1 #define SEED 01010001 @@ -43,43 +43,41 @@ namespace raft { // Function for generating problem with uniformly distributed integer costs between [0, COSTRANGE]. template -void generateProblem(weight_t* cost_matrix, int SP, int N, int costrange) -{ +void generateProblem(weight_t *cost_matrix, int SP, int N, int costrange) { long N2 = SP * N * N; std::uniform_int_distribution distribution(0, costrange); for (long i = 0; i < N2; i++) { - int val = distribution(generator); + int val = distribution(generator); cost_matrix[i] = (weight_t)val; } } template -void hungarian_test(int problemsize, - int costrange, - int problemcount, - int repetitions, - int batchsize, - weight_t epsilon, - bool verbose = false) -{ +void hungarian_test(int problemsize, int costrange, int problemcount, + int repetitions, int batchsize, weight_t epsilon, + bool verbose = false) { raft::handle_t handle; - weight_t* h_cost = new weight_t[batchsize * problemsize * problemsize]; + weight_t *h_cost = new weight_t[batchsize * problemsize * problemsize]; for (int j = 0; j < problemcount; j++) { generateProblem(h_cost, batchsize, problemsize, costrange); raft::mr::device::buffer elements_v( - handle.get_device_allocator(), handle.get_stream(), batchsize * problemsize * problemsize); + handle.get_device_allocator(), handle.get_stream(), + batchsize * problemsize * problemsize); raft::mr::device::buffer row_assignment_v( - handle.get_device_allocator(), handle.get_stream(), batchsize * problemsize); + handle.get_device_allocator(), handle.get_stream(), + batchsize * problemsize); raft::mr::device::buffer col_assignment_v( - handle.get_device_allocator(), handle.get_stream(), batchsize * problemsize); + handle.get_device_allocator(), handle.get_stream(), + batchsize * problemsize); - raft::update_device( - elements_v.data(), h_cost, batchsize * problemsize * problemsize, handle.get_stream()); + raft::update_device(elements_v.data(), h_cost, + batchsize * problemsize * problemsize, + handle.get_stream()); for (int i = 0; i < repetitions; i++) { float start = omp_get_wtime(); @@ -89,18 +87,20 @@ void hungarian_test(int problemsize, handle, problemsize, batchsize, epsilon); // Solve LAP(s) for given cost matrix - lpx.solve(elements_v.data(), row_assignment_v.data(), col_assignment_v.data()); + lpx.solve(elements_v.data(), row_assignment_v.data(), + col_assignment_v.data()); float end = omp_get_wtime(); float total_time = (end - start); if (verbose) { - // Use getPrimalObjectiveValue and getDualObjectiveValue APIs to get primal and dual - // objectives. At optimality both values should match. + // Use getPrimalObjectiveValue and getDualObjectiveValue APIs to get primal and dual objectives. At optimality both values should match. for (int k = 0; k < batchsize; k++) { - std::cout << j << ":" << i << ":" << k << ":" << lpx.getPrimalObjectiveValue(k) << ":" - << lpx.getDualObjectiveValue(k) << ":" << total_time << std::endl; + std::cout << j << ":" << i << ":" << k << ":" + << lpx.getPrimalObjectiveValue(k) << ":" + << lpx.getDualObjectiveValue(k) << ":" << total_time + << std::endl; } } } @@ -109,38 +109,34 @@ void hungarian_test(int problemsize, delete[] h_cost; } -TEST(Raft, HungarianIntFloat) -{ - hungarian_test( - PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, float{1e-6}); +TEST(Raft, HungarianIntFloat) { + hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, + BATCHSIZE, float{1e-6}); } -TEST(Raft, HungarianIntDouble) -{ - hungarian_test( - PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, double{1e-6}); +TEST(Raft, HungarianIntDouble) { + hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, + BATCHSIZE, double{1e-6}); } -TEST(Raft, HungarianIntLong) -{ - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, long{0}); +TEST(Raft, HungarianIntLong) { + hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, + BATCHSIZE, long{0}); } -TEST(Raft, HungarianLongFloat) -{ - hungarian_test( - PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, float{1e-6}); +TEST(Raft, HungarianLongFloat) { + hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, + BATCHSIZE, float{1e-6}); } -TEST(Raft, HungarianLongDouble) -{ - hungarian_test( - PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, double{1e-6}); +TEST(Raft, HungarianLongDouble) { + hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, + REPETITIONS, BATCHSIZE, double{1e-6}); } -TEST(Raft, HungarianLongLong) -{ - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, long{0}); +TEST(Raft, HungarianLongLong) { + hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, + BATCHSIZE, long{0}); } } // namespace raft diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu index 38e189f27e..2fc9d4e30f 100644 --- a/cpp/test/linalg/add.cu +++ b/cpp/test/linalg/add.cu @@ -27,8 +27,7 @@ namespace linalg { template class AddTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; @@ -43,8 +42,7 @@ class AddTest : public ::testing::TestWithParam> { add(out, in1, in2, len, stream); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(in1)); CUDA_CHECK(cudaFree(in2)); @@ -53,10 +51,9 @@ class AddTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void compare() - { - ASSERT_TRUE( - raft::devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); + void compare() { + ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); } protected: diff --git a/cpp/test/linalg/add.cuh b/cpp/test/linalg/add.cuh index 1d9352bfc1..137419758f 100644 --- a/cpp/test/linalg/add.cuh +++ b/cpp/test/linalg/add.cuh @@ -23,17 +23,18 @@ namespace raft { namespace linalg { template -__global__ void naiveAddElemKernel(OutT* out, const InT* in1, const InT* in2, int len) -{ +__global__ void naiveAddElemKernel(OutT *out, const InT *in1, const InT *in2, + int len) { int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { out[idx] = OutT(in1[idx] + in2[idx]); } + if (idx < len) { + out[idx] = OutT(in1[idx] + in2[idx]); + } } template -void naiveAddElem(OutT* out, const InT* in1, const InT* in2, int len) -{ +void naiveAddElem(OutT *out, const InT *in1, const InT *in2, int len) { static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveAddElemKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -46,8 +47,8 @@ struct AddInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const AddInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, + const AddInputs &dims) { return os; } diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu index 078c41356a..3ae4f86066 100644 --- a/cpp/test/linalg/binary_op.cu +++ b/cpp/test/linalg/binary_op.cu @@ -29,19 +29,20 @@ namespace linalg { // for an extended __device__ lambda cannot have private or protected access // within its class template -void binaryOpLaunch( - OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) -{ +void binaryOpLaunch(OutType *out, const InType *in1, const InType *in2, + IdxType len, cudaStream_t stream) { binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, + stream); } template -class BinaryOpTest : public ::testing::TestWithParam> { +class BinaryOpTest + : public ::testing::TestWithParam> { protected: - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); + void SetUp() override { + params = ::testing::TestWithParam< + BinaryOpInputs>::GetParam(); raft::random::Rng r(params.seed); cudaStream_t stream; @@ -58,8 +59,7 @@ class BinaryOpTest : public ::testing::TestWithParam> inputsf_i32 = {{0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf_i32 = { + {0.000001f, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestF_i32; -TEST_P(BinaryOpTestF_i32, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestF_i32, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32, ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32, + ::testing::ValuesIn(inputsf_i32)); -const std::vector> inputsf_i64 = {{0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf_i64 = { + {0.000001f, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestF_i64; -TEST_P(BinaryOpTestF_i64, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestF_i64, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i64, ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i64, + ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsf_i32_d = { {0.000001f, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestF_i32_D; -TEST_P(BinaryOpTestF_i32_D, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestF_i32_D, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32_D, ::testing::ValuesIn(inputsf_i32_d)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32_D, + ::testing::ValuesIn(inputsf_i32_d)); -const std::vector> inputsd_i32 = {{0.00000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd_i32 = { + {0.00000001, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestD_i32; -TEST_P(BinaryOpTestD_i32, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestD_i32, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i32, ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i32, + ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.00000001, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestD_i64; -TEST_P(BinaryOpTestD_i64, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestD_i64, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i64, ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i64, + ::testing::ValuesIn(inputsd_i64)); template class BinaryOpAlignment : public ::testing::Test { protected: - BinaryOpAlignment() - { + BinaryOpAlignment() { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); } void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } public: - void Misaligned() - { + void Misaligned() { // Test to trigger cudaErrorMisalignedAddress if veclen is incorrectly // chosen. int n = 1024; @@ -136,12 +142,8 @@ class BinaryOpAlignment : public ::testing::Test { CUDA_CHECK(cudaMemsetAsync(x.data(), 0, n * sizeof(math_t), stream)); CUDA_CHECK(cudaMemsetAsync(y.data(), 0, n * sizeof(math_t), stream)); raft::linalg::binaryOp( - z.data() + 9, - x.data() + 137, - y.data() + 19, - 256, - [] __device__(math_t x, math_t y) { return x + y; }, - stream); + z.data() + 9, x.data() + 137, y.data() + 19, 256, + [] __device__(math_t x, math_t y) { return x + y; }, stream); } raft::handle_t handle; diff --git a/cpp/test/linalg/binary_op.cuh b/cpp/test/linalg/binary_op.cuh index 97cb3ecb24..fd8ed6dd1e 100644 --- a/cpp/test/linalg/binary_op.cuh +++ b/cpp/test/linalg/binary_op.cuh @@ -24,17 +24,18 @@ namespace raft { namespace linalg { template -__global__ void naiveAddKernel(OutType* out, const InType* in1, const InType* in2, IdxType len) -{ +__global__ void naiveAddKernel(OutType *out, const InType *in1, + const InType *in2, IdxType len) { IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x); - if (idx < len) { out[idx] = static_cast(in1[idx] + in2[idx]); } + if (idx < len) { + out[idx] = static_cast(in1[idx] + in2[idx]); + } } template -void naiveAdd(OutType* out, const InType* in1, const InType* in2, IdxType len) -{ +void naiveAdd(OutType *out, const InType *in1, const InType *in2, IdxType len) { static const IdxType TPB = 64; - IdxType nblks = raft::ceildiv(len, TPB); + IdxType nblks = raft::ceildiv(len, TPB); naiveAddKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -47,8 +48,8 @@ struct BinaryOpInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const BinaryOpInputs& d) -{ +::std::ostream &operator<<(::std::ostream &os, + const BinaryOpInputs &d) { return os; } diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu index 5bbe3166cf..00236d53fa 100644 --- a/cpp/test/linalg/cholesky_r1.cu +++ b/cpp/test/linalg/cholesky_r1.cu @@ -36,8 +36,7 @@ class CholeskyR1Test : public ::testing::Test { L(allocator, handle.get_stream(), n_rows * n_rows), L_exp(allocator, handle.get_stream(), n_rows * n_rows), devInfo(allocator, handle.get_stream(), 1), - workspace(allocator, handle.get_stream()) - { + workspace(allocator, handle.get_stream()) { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); raft::update_device(G.data(), G_host, n_rows * n_rows, stream); @@ -49,58 +48,55 @@ class CholeskyR1Test : public ::testing::Test { int n_bytes = 0; // Initializing in CUBLAS_FILL_MODE_LOWER, because that has larger workspace // requirements. - raft::linalg::choleskyRank1Update( - handle, L.data(), n_rows, n_rows, nullptr, &n_bytes, CUBLAS_FILL_MODE_LOWER, stream); + raft::linalg::choleskyRank1Update(handle, L.data(), n_rows, n_rows, nullptr, + &n_bytes, CUBLAS_FILL_MODE_LOWER, stream); Lwork = std::max(Lwork * sizeof(math_t), (size_t)n_bytes); workspace.resize(Lwork, stream); } void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } - void testR1Update() - { + void testR1Update() { int n = n_rows * n_rows; - std::vector fillmode{CUBLAS_FILL_MODE_LOWER, CUBLAS_FILL_MODE_UPPER}; + std::vector fillmode{CUBLAS_FILL_MODE_LOWER, + CUBLAS_FILL_MODE_UPPER}; for (auto uplo : fillmode) { raft::copy(L.data(), G.data(), n, stream); for (int rank = 1; rank <= n_rows; rank++) { std::stringstream ss; - ss << "Rank " << rank << ((uplo == CUBLAS_FILL_MODE_LOWER) ? ", lower" : ", upper"); + ss << "Rank " << rank + << ((uplo == CUBLAS_FILL_MODE_LOWER) ? ", lower" : ", upper"); SCOPED_TRACE(ss.str()); // Expected solution using Cholesky factorization from scratch raft::copy(L_exp.data(), G.data(), n, stream); - CUSOLVER_CHECK(raft::linalg::cusolverDnpotrf(solver_handle, - uplo, - rank, - L_exp.data(), - n_rows, - (math_t*)workspace.data(), - Lwork, - devInfo.data(), - stream)); + CUSOLVER_CHECK(raft::linalg::cusolverDnpotrf( + solver_handle, uplo, rank, L_exp.data(), n_rows, + (math_t*)workspace.data(), Lwork, devInfo.data(), stream)); // Incremental Cholesky factorization using rank one updates. - raft::linalg::choleskyRank1Update( - handle, L.data(), rank, n_rows, workspace.data(), &Lwork, uplo, stream); + raft::linalg::choleskyRank1Update(handle, L.data(), rank, n_rows, + workspace.data(), &Lwork, uplo, + stream); - ASSERT_TRUE(raft::devArrMatch( - L_exp.data(), L.data(), n_rows * rank, raft::CompareApprox(3e-3))); + ASSERT_TRUE(raft::devArrMatch(L_exp.data(), L.data(), n_rows * rank, + raft::CompareApprox(3e-3))); } } } - void testR1Error() - { + void testR1Error() { raft::update_device(G.data(), G2_host, 4, stream); - std::vector fillmode{CUBLAS_FILL_MODE_LOWER, CUBLAS_FILL_MODE_UPPER}; + std::vector fillmode{CUBLAS_FILL_MODE_LOWER, + CUBLAS_FILL_MODE_UPPER}; for (auto uplo : fillmode) { raft::copy(L.data(), G.data(), 4, stream); ASSERT_NO_THROW(raft::linalg::choleskyRank1Update( handle, L.data(), 1, 2, workspace.data(), &Lwork, uplo, stream)); - ASSERT_THROW(raft::linalg::choleskyRank1Update( - handle, L.data(), 2, 2, workspace.data(), &Lwork, uplo, stream), - raft::exception); + ASSERT_THROW( + raft::linalg::choleskyRank1Update( + handle, L.data(), 2, 2, workspace.data(), &Lwork, uplo, stream), + raft::exception); math_t eps = std::numeric_limits::epsilon(); ASSERT_NO_THROW(raft::linalg::choleskyRank1Update( diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu index 2760d522bc..e45f5651b4 100644 --- a/cpp/test/linalg/coalesced_reduction.cu +++ b/cpp/test/linalg/coalesced_reduction.cu @@ -33,8 +33,8 @@ struct coalescedReductionInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const coalescedReductionInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, + const coalescedReductionInputs &dims) { return os; } @@ -42,18 +42,17 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void coalescedReductionLaunch( - T* dots, const T* data, int cols, int rows, cudaStream_t stream, bool inplace = false) -{ - coalescedReduction( - dots, data, cols, rows, (T)0, stream, inplace, [] __device__(T in, int i) { return in * in; }); +void coalescedReductionLaunch(T *dots, const T *data, int cols, int rows, + cudaStream_t stream, bool inplace = false) { + coalescedReduction(dots, data, cols, rows, (T)0, stream, inplace, + [] __device__(T in, int i) { return in * in; }); } template -class coalescedReductionTest : public ::testing::TestWithParam> { +class coalescedReductionTest + : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols; @@ -74,8 +73,7 @@ class coalescedReductionTest : public ::testing::TestWithParam> inputsf = {{0.000002f, 1024, 32, 1234ULL}, - {0.000002f, 1024, 64, 1234ULL}, - {0.000002f, 1024, 128, 1234ULL}, - {0.000002f, 1024, 256, 1234ULL}}; +const std::vector> inputsf = { + {0.000002f, 1024, 32, 1234ULL}, + {0.000002f, 1024, 64, 1234ULL}, + {0.000002f, 1024, 128, 1234ULL}, + {0.000002f, 1024, 256, 1234ULL}}; -const std::vector> inputsd = {{0.000000001, 1024, 32, 1234ULL}, - {0.000000001, 1024, 64, 1234ULL}, - {0.000000001, 1024, 128, 1234ULL}, - {0.000000001, 1024, 256, 1234ULL}}; +const std::vector> inputsd = { + {0.000000001, 1024, 32, 1234ULL}, + {0.000000001, 1024, 64, 1234ULL}, + {0.000000001, 1024, 128, 1234ULL}, + {0.000000001, 1024, 256, 1234ULL}}; typedef coalescedReductionTest coalescedReductionTestF; -TEST_P(coalescedReductionTestF, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - dots_exp, dots_act, params.rows, raft::CompareApprox(params.tolerance))); +TEST_P(coalescedReductionTestF, Result) { + ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows, + raft::CompareApprox(params.tolerance))); } typedef coalescedReductionTest coalescedReductionTestD; -TEST_P(coalescedReductionTestD, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - dots_exp, dots_act, params.rows, raft::CompareApprox(params.tolerance))); +TEST_P(coalescedReductionTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(coalescedReductionTests, - coalescedReductionTestF, +INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(coalescedReductionTests, - coalescedReductionTestD, +INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu index d8995ffa0a..2396558939 100644 --- a/cpp/test/linalg/divide.cu +++ b/cpp/test/linalg/divide.cu @@ -25,27 +25,30 @@ namespace raft { namespace linalg { template -__global__ void naiveDivideKernel(Type* out, const Type* in, Type scalar, int len) -{ +__global__ void naiveDivideKernel(Type *out, const Type *in, Type scalar, + int len) { int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { out[idx] = in[idx] / scalar; } + if (idx < len) { + out[idx] = in[idx] / scalar; + } } template -void naiveDivide(Type* out, const Type* in, Type scalar, int len, cudaStream_t stream) -{ +void naiveDivide(Type *out, const Type *in, Type scalar, int len, + cudaStream_t stream) { static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveDivideKernel<<>>(out, in, scalar, len); CUDA_CHECK(cudaPeekAtLastError()); } template -class DivideTest : public ::testing::TestWithParam> { +class DivideTest + : public ::testing::TestWithParam> { protected: - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); + void SetUp() override { + params = + ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; cudaStream_t stream; @@ -60,8 +63,7 @@ class DivideTest : public ::testing::TestWithParam> inputsf = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; typedef DivideTest DivideTestF; -TEST_P(DivideTestF, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); +TEST_P(DivideTestF, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestF, + ::testing::ValuesIn(inputsf)); typedef DivideTest DivideTestD; -const std::vector> inputsd = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; -TEST_P(DivideTestD, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); +const std::vector> inputsd = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +TEST_P(DivideTestD, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestD, + ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu index 5cad657dab..159d288174 100644 --- a/cpp/test/linalg/eig.cu +++ b/cpp/test/linalg/eig.cu @@ -35,16 +35,14 @@ struct EigInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const EigInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, const EigInputs &dims) { return os; } template class EigTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { raft::handle_t handle; stream = handle.get_stream(); @@ -53,8 +51,8 @@ class EigTest : public ::testing::TestWithParam> { int len = params.len; raft::allocate(cov_matrix, len); - T cov_matrix_h[] = { - 1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; + T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, + 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; ASSERT(len == 16, "This test only works with 4x4 matrices!"); raft::update_device(cov_matrix, cov_matrix_h, len, stream); @@ -63,23 +61,10 @@ class EigTest : public ::testing::TestWithParam> { raft::allocate(eig_vectors_jacobi, len); raft::allocate(eig_vals_jacobi, params.n_col); - T eig_vectors_ref_h[] = {0.2790, - -0.6498, - 0.6498, - -0.2789, - -0.5123, - 0.4874, - 0.4874, - -0.5123, - 0.6498, - 0.2789, - -0.2789, - -0.6498, - 0.4874, - 0.5123, - 0.5123, - 0.4874}; - T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266}; + T eig_vectors_ref_h[] = {0.2790, -0.6498, 0.6498, -0.2789, -0.5123, 0.4874, + 0.4874, -0.5123, 0.6498, 0.2789, -0.2789, -0.6498, + 0.4874, 0.5123, 0.5123, 0.4874}; + T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266}; raft::allocate(eig_vectors_ref, len); raft::allocate(eig_vals_ref, params.n_col); @@ -87,19 +72,13 @@ class EigTest : public ::testing::TestWithParam> { raft::update_device(eig_vectors_ref, eig_vectors_ref_h, len, stream); raft::update_device(eig_vals_ref, eig_vals_ref_h, params.n_col, stream); - eigDC(handle, cov_matrix, params.n_row, params.n_col, eig_vectors, eig_vals, stream); + eigDC(handle, cov_matrix, params.n_row, params.n_col, eig_vectors, eig_vals, + stream); - T tol = 1.e-7; + T tol = 1.e-7; int sweeps = 15; - eigJacobi(handle, - cov_matrix, - params.n_row, - params.n_col, - eig_vectors_jacobi, - eig_vals_jacobi, - stream, - tol, - sweeps); + eigJacobi(handle, cov_matrix, params.n_row, params.n_col, + eig_vectors_jacobi, eig_vals_jacobi, stream, tol, sweeps); // test code for comparing two methods len = params.n * params.n; @@ -111,20 +90,14 @@ class EigTest : public ::testing::TestWithParam> { r.uniform(cov_matrix_large, len, T(-1.0), T(1.0), stream); - eigDC(handle, cov_matrix_large, params.n, params.n, eig_vectors_large, eig_vals_large, stream); - eigJacobi(handle, - cov_matrix_large, - params.n, - params.n, - eig_vectors_jacobi_large, - eig_vals_jacobi_large, - stream, - tol, + eigDC(handle, cov_matrix_large, params.n, params.n, eig_vectors_large, + eig_vals_large, stream); + eigJacobi(handle, cov_matrix_large, params.n, params.n, + eig_vectors_jacobi_large, eig_vals_jacobi_large, stream, tol, sweeps); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(cov_matrix)); CUDA_CHECK(cudaFree(eig_vectors)); CUDA_CHECK(cudaFree(eig_vectors_jacobi)); @@ -136,95 +109,89 @@ class EigTest : public ::testing::TestWithParam> { protected: EigInputs params; - T *cov_matrix, *eig_vectors, *eig_vectors_jacobi, *eig_vectors_ref, *eig_vals, *eig_vals_jacobi, - *eig_vals_ref; + T *cov_matrix, *eig_vectors, *eig_vectors_jacobi, *eig_vectors_ref, *eig_vals, + *eig_vals_jacobi, *eig_vals_ref; - T *cov_matrix_large, *eig_vectors_large, *eig_vectors_jacobi_large, *eig_vals_large, - *eig_vals_jacobi_large; + T *cov_matrix_large, *eig_vectors_large, *eig_vectors_jacobi_large, + *eig_vals_large, *eig_vals_jacobi_large; cudaStream_t stream; }; -const std::vector> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsf2 = { + {0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; -const std::vector> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsd2 = { + {0.001, 4 * 4, 4, 4, 1234ULL, 256}}; typedef EigTest EigTestValF; -TEST_P(EigTestValF, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValF, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestValD; -TEST_P(EigTestValD, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValD, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecF; -TEST_P(EigTestVecF, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - eig_vectors_ref, eig_vectors, params.len, raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecF, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vectors_ref, eig_vectors, params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecD; -TEST_P(EigTestVecD, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - eig_vectors_ref, eig_vectors, params.len, raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecD, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vectors_ref, eig_vectors, params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestValJacobiF; -TEST_P(EigTestValJacobiF, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - eig_vals_ref, eig_vals_jacobi, params.n_col, raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValJacobiF, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vals_ref, eig_vals_jacobi, params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestValJacobiD; -TEST_P(EigTestValJacobiD, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - eig_vals_ref, eig_vals_jacobi, params.n_col, raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValJacobiD, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vals_ref, eig_vals_jacobi, params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecJacobiF; -TEST_P(EigTestVecJacobiF, Result) -{ - ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref, - eig_vectors_jacobi, - params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecJacobiF, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vectors_ref, eig_vectors_jacobi, params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecJacobiD; -TEST_P(EigTestVecJacobiD, Result) -{ - ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref, - eig_vectors_jacobi, - params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecJacobiD, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vectors_ref, eig_vectors_jacobi, params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecCompareF; -TEST_P(EigTestVecCompareF, Result) -{ - ASSERT_TRUE(raft::devArrMatch(eig_vectors_large, - eig_vectors_jacobi_large, - (params.n * params.n), - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecCompareF, Result) { + ASSERT_TRUE(raft::devArrMatch( + eig_vectors_large, eig_vectors_jacobi_large, (params.n * params.n), + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecCompareD; -TEST_P(EigTestVecCompareD, Result) -{ - ASSERT_TRUE(raft::devArrMatch(eig_vectors_large, - eig_vectors_jacobi_large, - (params.n * params.n), - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecCompareD, Result) { + ASSERT_TRUE(raft::devArrMatch( + eig_vectors_large, eig_vectors_jacobi_large, (params.n * params.n), + raft::CompareApproxAbs(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValF, ::testing::ValuesIn(inputsf2)); @@ -235,13 +202,17 @@ INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecF, ::testing::ValuesIn(inputsf2)); INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiF, ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiF, + ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiD, ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiD, + ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiF, ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiF, + ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiD, ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiD, + ::testing::ValuesIn(inputsd2)); } // namespace linalg } // namespace raft diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu index b3cfb19174..b3980f281d 100644 --- a/cpp/test/linalg/eig_sel.cu +++ b/cpp/test/linalg/eig_sel.cu @@ -37,44 +37,32 @@ struct EigSelInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const EigSelInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, const EigSelInputs &dims) { return os; } template class EigSelTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { raft::handle_t handle; stream = handle.get_stream(); - params = ::testing::TestWithParam>::GetParam(); + params = ::testing::TestWithParam>::GetParam(); int len = params.len; raft::allocate(cov_matrix, len); - T cov_matrix_h[] = { - 1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; + T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, + 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; ASSERT(len == 16, "This test only works with 4x4 matrices!"); raft::update_device(cov_matrix, cov_matrix_h, len, stream); raft::allocate(eig_vectors, 12); raft::allocate(eig_vals, params.n_col); - T eig_vectors_ref_h[] = {-0.5123, - 0.4874, - 0.4874, - -0.5123, - 0.6498, - 0.2789, - -0.2789, - -0.6498, - 0.4874, - 0.5123, - 0.5123, - 0.4874}; - T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266}; + T eig_vectors_ref_h[] = {-0.5123, 0.4874, 0.4874, -0.5123, 0.6498, 0.2789, + -0.2789, -0.6498, 0.4874, 0.5123, 0.5123, 0.4874}; + T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266}; raft::allocate(eig_vectors_ref, 12); raft::allocate(eig_vals_ref, params.n_col); @@ -82,19 +70,11 @@ class EigSelTest : public ::testing::TestWithParam> { raft::update_device(eig_vectors_ref, eig_vectors_ref_h, 12, stream); raft::update_device(eig_vals_ref, eig_vals_ref_h, 4, stream); - eigSelDC(handle, - cov_matrix, - params.n_row, - params.n_col, - 3, - eig_vectors, - eig_vals, - EigVecMemUsage::OVERWRITE_INPUT, - stream); + eigSelDC(handle, cov_matrix, params.n_row, params.n_col, 3, eig_vectors, + eig_vals, EigVecMemUsage::OVERWRITE_INPUT, stream); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(cov_matrix)); CUDA_CHECK(cudaFree(eig_vectors)); CUDA_CHECK(cudaFree(eig_vals)); @@ -109,45 +89,51 @@ class EigSelTest : public ::testing::TestWithParam> { cudaStream_t stream; }; -const std::vector> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsf2 = { + {0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; -const std::vector> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsd2 = { + {0.001, 4 * 4, 4, 4, 1234ULL, 256}}; typedef EigSelTest EigSelTestValF; -TEST_P(EigSelTestValF, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestValF, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef EigSelTest EigSelTestValD; -TEST_P(EigSelTestValD, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestValD, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef EigSelTest EigSelTestVecF; -TEST_P(EigSelTestVecF, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - eig_vectors_ref, eig_vectors, 12, raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestVecF, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vectors_ref, eig_vectors, 12, + raft::CompareApproxAbs(params.tolerance))); } typedef EigSelTest EigSelTestVecD; -TEST_P(EigSelTestVecD, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - eig_vectors_ref, eig_vectors, 12, raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestVecD, Result) { + ASSERT_TRUE( + raft::devArrMatch(eig_vectors_ref, eig_vectors, 12, + raft::CompareApproxAbs(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValF, ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValF, + ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValD, ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValD, + ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecF, ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecF, + ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecD, ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecD, + ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu index f0e04403e8..572951c557 100644 --- a/cpp/test/linalg/eltwise.cu +++ b/cpp/test/linalg/eltwise.cu @@ -26,17 +26,19 @@ namespace linalg { //// Testing unary ops template -__global__ void naiveScaleKernel(Type* out, const Type* in, Type scalar, int len) -{ +__global__ void naiveScaleKernel(Type *out, const Type *in, Type scalar, + int len) { int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { out[idx] = scalar * in[idx]; } + if (idx < len) { + out[idx] = scalar * in[idx]; + } } template -void naiveScale(Type* out, const Type* in, Type scalar, int len, cudaStream_t stream) -{ +void naiveScale(Type *out, const Type *in, Type scalar, int len, + cudaStream_t stream) { static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveScaleKernel<<>>(out, in, scalar, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -50,19 +52,19 @@ struct ScalarMultiplyInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const ScalarMultiplyInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, + const ScalarMultiplyInputs &dims) { return os; } template -class ScalarMultiplyTest : public ::testing::TestWithParam> { +class ScalarMultiplyTest + : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - int len = params.len; + int len = params.len; T scalar = params.scalar; cudaStream_t stream; @@ -76,8 +78,7 @@ class ScalarMultiplyTest : public ::testing::TestWithParam> inputsf1 = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf1 = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; const std::vector> inputsd1 = { {0.00000001, 1024 * 1024, 2.0, 1234ULL}}; typedef ScalarMultiplyTest ScalarMultiplyTestF; -TEST_P(ScalarMultiplyTestF, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); +TEST_P(ScalarMultiplyTestF, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); } typedef ScalarMultiplyTest ScalarMultiplyTestD; -TEST_P(ScalarMultiplyTestD, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); +TEST_P(ScalarMultiplyTestD, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestF, ::testing::ValuesIn(inputsf1)); +INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestF, + ::testing::ValuesIn(inputsf1)); -INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestD, ::testing::ValuesIn(inputsd1)); +INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestD, + ::testing::ValuesIn(inputsd1)); //// Testing binary ops template -__global__ void naiveAddKernel(Type* out, const Type* in1, const Type* in2, int len) -{ +__global__ void naiveAddKernel(Type *out, const Type *in1, const Type *in2, + int len) { int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { out[idx] = in1[idx] + in2[idx]; } + if (idx < len) { + out[idx] = in1[idx] + in2[idx]; + } } template -void naiveAdd(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream) -{ +void naiveAdd(Type *out, const Type *in1, const Type *in2, int len, + cudaStream_t stream) { static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveAddKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -135,16 +141,15 @@ struct EltwiseAddInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const EltwiseAddInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, + const EltwiseAddInputs &dims) { return os; } template class EltwiseAddTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); @@ -162,8 +167,7 @@ class EltwiseAddTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(in1)); CUDA_CHECK(cudaFree(in2)); CUDA_CHECK(cudaFree(out_ref)); @@ -175,25 +179,29 @@ class EltwiseAddTest : public ::testing::TestWithParam> { T *in1, *in2, *out_ref, *out; }; -const std::vector> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf2 = { + {0.000001f, 1024 * 1024, 1234ULL}}; -const std::vector> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd2 = { + {0.00000001, 1024 * 1024, 1234ULL}}; typedef EltwiseAddTest EltwiseAddTestF; -TEST_P(EltwiseAddTestF, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); +TEST_P(EltwiseAddTestF, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); } typedef EltwiseAddTest EltwiseAddTestD; -TEST_P(EltwiseAddTestD, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); +TEST_P(EltwiseAddTestD, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestF, ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestF, + ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestD, ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestD, + ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/gemm_layout.cu b/cpp/test/linalg/gemm_layout.cu index e95dbbc502..cecfc5eb8e 100644 --- a/cpp/test/linalg/gemm_layout.cu +++ b/cpp/test/linalg/gemm_layout.cu @@ -36,9 +36,9 @@ struct GemmLayoutInputs { // Reference GEMM implementation. template -__global__ void naiveGemm( - T* Z, T* X, T* Y, int M, int N, int K, bool isZColMajor, bool isXColMajor, bool isYColMajor) -{ +__global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K, + bool isZColMajor, bool isXColMajor, + bool isYColMajor) { int tidx = blockIdx.x * blockDim.x + threadIdx.x; int tidy = blockIdx.y * blockDim.y + threadIdx.y; @@ -51,7 +51,7 @@ __global__ void naiveGemm( temp += X[xIndex] * Y[yIndex]; } int zIndex = isZColMajor ? m + n * M : m * N + n; - Z[zIndex] = temp; + Z[zIndex] = temp; } } } @@ -59,8 +59,7 @@ __global__ void naiveGemm( template class GemmLayoutTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); raft::handle_t handle; @@ -73,8 +72,8 @@ class GemmLayoutTest : public ::testing::TestWithParam> { // Dimensions of Y : K x N // Dimensions of Z : M x N - T* X = NULL; // Argument X - T* Y = NULL; // Argument Y + T *X = NULL; // Argument X + T *Y = NULL; // Argument Y size_t xElems = params.M * params.K; size_t yElems = params.K * params.N; @@ -88,35 +87,27 @@ class GemmLayoutTest : public ::testing::TestWithParam> { r.uniform(X, xElems, T(-10.0), T(10.0), stream); r.uniform(Y, yElems, T(-10.0), T(10.0), stream); - dim3 blocks(raft::ceildiv(params.M, 128), raft::ceildiv(params.N, 4), 1); + dim3 blocks(raft::ceildiv(params.M, 128), + raft::ceildiv(params.N, 4), 1); dim3 threads(128, 4, 1); - naiveGemm<<>>( - refZ, X, Y, params.M, params.N, params.K, params.zLayout, params.xLayout, params.yLayout); - - gemm(handle, - Z, - X, - Y, - params.M, - params.N, - params.K, - params.zLayout, - params.xLayout, - params.yLayout, - stream); + naiveGemm<<>>(refZ, X, Y, params.M, params.N, params.K, + params.zLayout, params.xLayout, + params.yLayout); + + gemm(handle, Z, X, Y, params.M, params.N, params.K, params.zLayout, + params.xLayout, params.yLayout, stream); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(refZ)); CUDA_CHECK(cudaFree(Z)); } protected: GemmLayoutInputs params; - T* refZ = NULL; // Reference result for comparison - T* Z = NULL; // Computed result + T *refZ = NULL; // Reference result for comparison + T *Z = NULL; // Computed result }; const std::vector> inputsf = { @@ -140,20 +131,22 @@ const std::vector> inputsd = { {50, 80, 60, false, false, false, 893038ULL}}; typedef GemmLayoutTest GemmLayoutTestF; -TEST_P(GemmLayoutTestF, Result) -{ - ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox(1e-4))); +TEST_P(GemmLayoutTestF, Result) { + ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, + raft::CompareApprox(1e-4))); } typedef GemmLayoutTest GemmLayoutTestD; -TEST_P(GemmLayoutTestD, Result) -{ - ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox(1e-6))); +TEST_P(GemmLayoutTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, + raft::CompareApprox(1e-6))); } -INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestF, + ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestD, + ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index 0e33d9758f..227bce6a48 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -26,22 +26,13 @@ namespace raft { namespace linalg { template -void mapLaunch(OutType* out, - const InType* in1, - const InType* in2, - const InType* in3, - InType scalar, - IdxType len, - cudaStream_t stream) -{ +void mapLaunch(OutType *out, const InType *in1, const InType *in2, + const InType *in3, InType scalar, IdxType len, + cudaStream_t stream) { map( - out, - len, + out, len, [=] __device__(InType a, InType b, InType c) { return a + b + c + scalar; }, - stream, - in1, - in2, - in3); + stream, in1, in2, in3); } template @@ -53,15 +44,10 @@ struct MapInputs { }; template -void create_ref(OutType* out_ref, - const InType* in1, - const InType* in2, - const InType* in3, - InType scalar, - IdxType len, - cudaStream_t stream) -{ - InType* tmp; +void create_ref(OutType *out_ref, const InType *in1, const InType *in2, + const InType *in3, InType scalar, IdxType len, + cudaStream_t stream) { + InType *tmp; allocate(tmp, len); eltwiseAdd(tmp, in1, in2, len, stream); eltwiseAdd(out_ref, tmp, in3, len, stream); @@ -70,11 +56,12 @@ void create_ref(OutType* out_ref, } template -class MapTest : public ::testing::TestWithParam> { +class MapTest + : public ::testing::TestWithParam> { protected: - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); + void SetUp() override { + params = + ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); cudaStream_t stream; @@ -94,8 +81,7 @@ class MapTest : public ::testing::TestWithParam> inputsf_i32 = {{0.000001f, 1024 * 1024, 1234ULL, 3.2}}; +const std::vector> inputsf_i32 = { + {0.000001f, 1024 * 1024, 1234ULL, 3.2}}; typedef MapTest MapTestF_i32; -TEST_P(MapTestF_i32, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); +TEST_P(MapTestF_i32, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32, ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32, + ::testing::ValuesIn(inputsf_i32)); -const std::vector> inputsf_i64 = {{0.000001f, 1024 * 1024, 1234ULL, 9.4}}; +const std::vector> inputsf_i64 = { + {0.000001f, 1024 * 1024, 1234ULL, 9.4}}; typedef MapTest MapTestF_i64; -TEST_P(MapTestF_i64, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); +TEST_P(MapTestF_i64, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i64, ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i64, + ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsf_i32_d = { {0.000001f, 1024 * 1024, 1234ULL, 5.9}}; typedef MapTest MapTestF_i32_D; -TEST_P(MapTestF_i32_D, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); +TEST_P(MapTestF_i32_D, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32_D, ::testing::ValuesIn(inputsf_i32_d)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32_D, + ::testing::ValuesIn(inputsf_i32_d)); -const std::vector> inputsd_i32 = {{0.00000001, 1024 * 1024, 1234ULL, 7.5}}; +const std::vector> inputsd_i32 = { + {0.00000001, 1024 * 1024, 1234ULL, 7.5}}; typedef MapTest MapTestD_i32; -TEST_P(MapTestD_i32, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); +TEST_P(MapTestD_i32, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i32, ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i32, + ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.00000001, 1024 * 1024, 1234ULL, 5.2}}; typedef MapTest MapTestD_i64; -TEST_P(MapTestD_i64, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); +TEST_P(MapTestD_i64, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i64, ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i64, + ::testing::ValuesIn(inputsd_i64)); } // namespace linalg } // namespace raft diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu index a1b82e7644..6e146fa4bb 100644 --- a/cpp/test/linalg/map_then_reduce.cu +++ b/cpp/test/linalg/map_then_reduce.cu @@ -25,18 +25,21 @@ namespace raft { namespace linalg { template -__global__ void naiveMapReduceKernel(OutType* out, const InType* in, size_t len, MapOp map) -{ +__global__ void naiveMapReduceKernel(OutType *out, const InType *in, size_t len, + MapOp map) { int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { raft::myAtomicAdd(out, (OutType)map(in[idx])); } + if (idx < len) { + raft::myAtomicAdd(out, (OutType)map(in[idx])); + } } template -void naiveMapReduce(OutType* out, const InType* in, size_t len, MapOp map, cudaStream_t stream) -{ +void naiveMapReduce(OutType *out, const InType *in, size_t len, MapOp map, + cudaStream_t stream) { static const int TPB = 64; - int nblks = raft::ceildiv(len, (size_t)TPB); - naiveMapReduceKernel<<>>(out, in, len, map); + int nblks = raft::ceildiv(len, (size_t)TPB); + naiveMapReduceKernel + <<>>(out, in, len, map); CUDA_CHECK(cudaPeekAtLastError()); } @@ -48,8 +51,7 @@ struct MapReduceInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const MapReduceInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, const MapReduceInputs &dims) { return os; } @@ -57,9 +59,8 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void mapReduceLaunch( - OutType* out_ref, OutType* out, const InType* in, size_t len, cudaStream_t stream) -{ +void mapReduceLaunch(OutType *out_ref, OutType *out, const InType *in, + size_t len, cudaStream_t stream) { auto op = [] __device__(InType in) { return in; }; naiveMapReduce(out_ref, in, len, op, stream); mapThenSumReduce(out, len, op, 0, in); @@ -68,8 +69,7 @@ void mapReduceLaunch( template class MapReduceTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); auto len = params.len; @@ -84,8 +84,7 @@ class MapReduceTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(in)); CUDA_CHECK(cudaFree(out_ref)); CUDA_CHECK(cudaFree(out)); @@ -93,44 +92,48 @@ class MapReduceTest : public ::testing::TestWithParam> { protected: MapReduceInputs params; - InType* in; + InType *in; OutType *out_ref, *out; }; -const std::vector> inputsf = {{0.001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf = { + {0.001f, 1024 * 1024, 1234ULL}}; typedef MapReduceTest MapReduceTestFF; -TEST_P(MapReduceTestFF, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); +TEST_P(MapReduceTestFF, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFF, + ::testing::ValuesIn(inputsf)); typedef MapReduceTest MapReduceTestFD; -TEST_P(MapReduceTestFD, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); +TEST_P(MapReduceTestFD, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFD, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFD, + ::testing::ValuesIn(inputsf)); -const std::vector> inputsd = {{0.000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd = { + {0.000001, 1024 * 1024, 1234ULL}}; typedef MapReduceTest MapReduceTestDD; -TEST_P(MapReduceTestDD, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox(params.tolerance))); +TEST_P(MapReduceTestDD, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestDD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestDD, + ::testing::ValuesIn(inputsd)); template class MapGenericReduceTest : public ::testing::Test { - using InType = typename T::first_type; + using InType = typename T::first_type; using OutType = typename T::second_type; protected: MapGenericReduceTest() : allocator(handle.get_device_allocator()), input(allocator, handle.get_stream(), n), - output(allocator, handle.get_stream(), 1) - { + output(allocator, handle.get_stream(), 1) { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); initInput(input.data(), input.size(), stream); @@ -139,8 +142,7 @@ class MapGenericReduceTest : public ::testing::Test { void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } public: - void initInput(InType* input, int n, cudaStream_t stream) - { + void initInput(InType *input, int n, cudaStream_t stream) { raft::random::Rng r(137); r.uniform(input, n, InType(2), InType(3), stream); InType val = 1; @@ -149,19 +151,21 @@ class MapGenericReduceTest : public ::testing::Test { raft::update_device(input + 337, &val, 1, stream); } - void testMin() - { - auto op = [] __device__(InType in) { return in; }; + void testMin() { + auto op = [] __device__(InType in) { return in; }; const OutType neutral = std::numeric_limits::max(); - mapThenReduce(output.data(), input.size(), neutral, op, cub::Min(), stream, input.data()); - EXPECT_TRUE(raft::devArrMatch(OutType(1), output.data(), 1, raft::Compare())); + mapThenReduce(output.data(), input.size(), neutral, op, cub::Min(), stream, + input.data()); + EXPECT_TRUE(raft::devArrMatch(OutType(1), output.data(), 1, + raft::Compare())); } - void testMax() - { - auto op = [] __device__(InType in) { return in; }; + void testMax() { + auto op = [] __device__(InType in) { return in; }; const OutType neutral = std::numeric_limits::min(); - mapThenReduce(output.data(), input.size(), neutral, op, cub::Max(), stream, input.data()); - EXPECT_TRUE(raft::devArrMatch(OutType(5), output.data(), 1, raft::Compare())); + mapThenReduce(output.data(), input.size(), neutral, op, cub::Max(), stream, + input.data()); + EXPECT_TRUE(raft::devArrMatch(OutType(5), output.data(), 1, + raft::Compare())); } protected: @@ -174,7 +178,8 @@ class MapGenericReduceTest : public ::testing::Test { }; using IoTypePair = - ::testing::Types, std::pair, std::pair>; + ::testing::Types, std::pair, + std::pair>; TYPED_TEST_CASE(MapGenericReduceTest, IoTypePair); TYPED_TEST(MapGenericReduceTest, min) { this->testMin(); } diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index 6ad9bfba10..aa46c78b0f 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -32,8 +32,8 @@ struct MatVecOpInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const MatVecOpInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, + const MatVecOpInputs &dims) { return os; } @@ -41,48 +41,26 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void matrixVectorOpLaunch(T* out, - const T* in, - const T* vec1, - const T* vec2, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - bool useTwoVectors, - cudaStream_t stream) -{ +void matrixVectorOpLaunch(T *out, const T *in, const T *vec1, const T *vec2, + IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, bool useTwoVectors, + cudaStream_t stream) { if (useTwoVectors) { matrixVectorOp( - out, - in, - vec1, - vec2, - D, - N, - rowMajor, - bcastAlongRows, - [] __device__(T a, T b, T c) { return a + b + c; }, - stream); + out, in, vec1, vec2, D, N, rowMajor, bcastAlongRows, + [] __device__(T a, T b, T c) { return a + b + c; }, stream); } else { matrixVectorOp( - out, - in, - vec1, - D, - N, - rowMajor, - bcastAlongRows, - [] __device__(T a, T b) { return a + b; }, - stream); + out, in, vec1, D, N, rowMajor, bcastAlongRows, + [] __device__(T a, T b) { return a + b; }, stream); } } template -class MatVecOpTest : public ::testing::TestWithParam> { +class MatVecOpTest + : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); IdxType N = params.rows, D = params.cols; @@ -100,25 +78,18 @@ class MatVecOpTest : public ::testing::TestWithParam> r.uniform(vec1, vecLen, (T)-1.0, (T)1.0, stream); r.uniform(vec2, vecLen, (T)-1.0, (T)1.0, stream); if (params.useTwoVectors) { - naiveMatVec(out_ref, in, vec1, vec2, D, N, params.rowMajor, params.bcastAlongRows, (T)1.0); + naiveMatVec(out_ref, in, vec1, vec2, D, N, params.rowMajor, + params.bcastAlongRows, (T)1.0); } else { - naiveMatVec(out_ref, in, vec1, D, N, params.rowMajor, params.bcastAlongRows, (T)1.0); + naiveMatVec(out_ref, in, vec1, D, N, params.rowMajor, + params.bcastAlongRows, (T)1.0); } - matrixVectorOpLaunch(out, - in, - vec1, - vec2, - D, - N, - params.rowMajor, - params.bcastAlongRows, - params.useTwoVectors, - stream); + matrixVectorOpLaunch(out, in, vec1, vec2, D, N, params.rowMajor, + params.bcastAlongRows, params.useTwoVectors, stream); CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(vec1)); CUDA_CHECK(cudaFree(vec2)); CUDA_CHECK(cudaFree(out)); @@ -150,23 +121,23 @@ const std::vector> inputsf_i32 = { {0.00001f, 1024, 32, false, false, true, 1234ULL}, {0.00001f, 1024, 64, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestF_i32; -TEST_P(MatVecOpTestF_i32, Result) -{ - ASSERT_TRUE( - devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox(params.tolerance))); +TEST_P(MatVecOpTestF_i32, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32, ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32, + ::testing::ValuesIn(inputsf_i32)); const std::vector> inputsf_i64 = { {0.00001f, 2500, 250, false, false, false, 1234ULL}, {0.00001f, 2500, 250, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestF_i64; -TEST_P(MatVecOpTestF_i64, Result) -{ - ASSERT_TRUE( - devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox(params.tolerance))); +TEST_P(MatVecOpTestF_i64, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64, ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64, + ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsd_i32 = { {0.0000001, 1024, 32, true, true, false, 1234ULL}, @@ -187,23 +158,23 @@ const std::vector> inputsd_i32 = { {0.0000001, 1024, 32, false, false, true, 1234ULL}, {0.0000001, 1024, 64, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestD_i32; -TEST_P(MatVecOpTestD_i32, Result) -{ - ASSERT_TRUE( - devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox(params.tolerance))); +TEST_P(MatVecOpTestD_i32, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32, ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32, + ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.0000001, 2500, 250, false, false, false, 1234ULL}, {0.0000001, 2500, 250, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestD_i64; -TEST_P(MatVecOpTestD_i64, Result) -{ - ASSERT_TRUE( - devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox(params.tolerance))); +TEST_P(MatVecOpTestD_i64, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64, ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64, + ::testing::ValuesIn(inputsd_i64)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh index 5f9c6f1ef3..69c45c9866 100644 --- a/cpp/test/linalg/matrix_vector_op.cuh +++ b/cpp/test/linalg/matrix_vector_op.cuh @@ -22,15 +22,9 @@ namespace raft { namespace linalg { template -__global__ void naiveMatVecKernel(Type* out, - const Type* mat, - const Type* vec, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Type scalar) -{ +__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec, + IdxType D, IdxType N, bool rowMajor, + bool bcastAlongRows, Type scalar) { IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; IdxType len = N * D; IdxType col; @@ -43,37 +37,27 @@ __global__ void naiveMatVecKernel(Type* out, } else { col = idx / N; } - if (idx < len) { out[idx] = mat[idx] + scalar * vec[col]; } + if (idx < len) { + out[idx] = mat[idx] + scalar * vec[col]; + } } template -void naiveMatVec(Type* out, - const Type* mat, - const Type* vec, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Type scalar) -{ +void naiveMatVec(Type *out, const Type *mat, const Type *vec, IdxType D, + IdxType N, bool rowMajor, bool bcastAlongRows, Type scalar) { static const IdxType TPB = 64; - IdxType len = N * D; - IdxType nblks = raft::ceildiv(len, TPB); - naiveMatVecKernel<<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar); + IdxType len = N * D; + IdxType nblks = raft::ceildiv(len, TPB); + naiveMatVecKernel + <<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void naiveMatVecKernel(Type* out, - const Type* mat, - const Type* vec1, - const Type* vec2, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Type scalar) -{ +__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec1, + const Type *vec2, IdxType D, IdxType N, + bool rowMajor, bool bcastAlongRows, + Type scalar) { IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; IdxType len = N * D; IdxType col; @@ -86,25 +70,20 @@ __global__ void naiveMatVecKernel(Type* out, } else { col = idx / N; } - if (idx < len) { out[idx] = mat[idx] + scalar * vec1[col] + vec2[col]; } + if (idx < len) { + out[idx] = mat[idx] + scalar * vec1[col] + vec2[col]; + } } template -void naiveMatVec(Type* out, - const Type* mat, - const Type* vec1, - const Type* vec2, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Type scalar) -{ +void naiveMatVec(Type *out, const Type *mat, const Type *vec1, const Type *vec2, + IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows, + Type scalar) { static const IdxType TPB = 64; - IdxType len = N * D; - IdxType nblks = raft::ceildiv(len, TPB); - naiveMatVecKernel - <<>>(out, mat, vec1, vec2, D, N, rowMajor, bcastAlongRows, scalar); + IdxType len = N * D; + IdxType nblks = raft::ceildiv(len, TPB); + naiveMatVecKernel<<>>(out, mat, vec1, vec2, D, N, rowMajor, + bcastAlongRows, scalar); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu index 6c38d89891..1d3e753de3 100644 --- a/cpp/test/linalg/multiply.cu +++ b/cpp/test/linalg/multiply.cu @@ -27,8 +27,7 @@ namespace linalg { template class MultiplyTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; @@ -44,8 +43,7 @@ class MultiplyTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(in)); CUDA_CHECK(cudaFree(out_ref)); CUDA_CHECK(cudaFree(out)); @@ -56,21 +54,25 @@ class MultiplyTest : public ::testing::TestWithParam> { T *in, *out_ref, *out; }; -const std::vector> inputsf = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; typedef MultiplyTest MultiplyTestF; -TEST_P(MultiplyTestF, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); +TEST_P(MultiplyTestF, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestF, + ::testing::ValuesIn(inputsf)); typedef MultiplyTest MultiplyTestD; -const std::vector> inputsd = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; -TEST_P(MultiplyTestD, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); +const std::vector> inputsd = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +TEST_P(MultiplyTestD, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestD, + ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu index 35bc72dee4..acc25addd0 100644 --- a/cpp/test/linalg/norm.cu +++ b/cpp/test/linalg/norm.cu @@ -34,19 +34,17 @@ struct NormInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const NormInputs& I) -{ - os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " << I.type << ", " - << I.do_sqrt << ", " << I.seed << '}' << std::endl; +::std::ostream &operator<<(::std::ostream &os, const NormInputs &I) { + os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " + << I.type << ", " << I.do_sqrt << ", " << I.seed << '}' << std::endl; return os; } ///// Row-wise norm test definitions template -__global__ void naiveRowNormKernel( - Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt) -{ - Type acc = (Type)0; +__global__ void naiveRowNormKernel(Type *dots, const Type *data, int D, int N, + NormType type, bool do_sqrt) { + Type acc = (Type)0; int rowStart = threadIdx.x + blockIdx.x * blockDim.x; if (rowStart < N) { for (int i = 0; i < D; ++i) { @@ -61,20 +59,19 @@ __global__ void naiveRowNormKernel( } template -void naiveRowNorm( - Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt, cudaStream_t stream) -{ +void naiveRowNorm(Type *dots, const Type *data, int D, int N, NormType type, + bool do_sqrt, cudaStream_t stream) { static const int TPB = 64; - int nblks = raft::ceildiv(N, TPB); - naiveRowNormKernel<<>>(dots, data, D, N, type, do_sqrt); + int nblks = raft::ceildiv(N, TPB); + naiveRowNormKernel + <<>>(dots, data, D, N, type, do_sqrt); CUDA_CHECK(cudaPeekAtLastError()); } template class RowNormTest : public ::testing::TestWithParam> { public: - void SetUp() override - { + void SetUp() override { CUDA_CHECK(cudaStreamCreate(&stream)); params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); @@ -85,18 +82,19 @@ class RowNormTest : public ::testing::TestWithParam> { raft::allocate(dots_exp, rows); raft::allocate(dots_act, rows); r.uniform(data, len, T(-1.0), T(1.0), stream); - naiveRowNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, stream); + naiveRowNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, + stream); if (params.do_sqrt) { auto fin_op = [] __device__(T in) { return raft::mySqrt(in); }; - rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream, fin_op); + rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream, + fin_op); } else { rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream); } CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(dots_exp)); CUDA_CHECK(cudaFree(dots_act)); @@ -111,11 +109,10 @@ class RowNormTest : public ::testing::TestWithParam> { ///// Column-wise norm test definitisons template -__global__ void naiveColNormKernel( - Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt) -{ +__global__ void naiveColNormKernel(Type *dots, const Type *data, int D, int N, + NormType type, bool do_sqrt) { int colID = threadIdx.x + blockIdx.x * blockDim.x; - if (colID > D) return; // avoid out-of-bounds thread + if (colID > D) return; //avoid out-of-bounds thread Type acc = 0; for (int i = 0; i < N; i++) { @@ -127,20 +124,19 @@ __global__ void naiveColNormKernel( } template -void naiveColNorm( - Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt, cudaStream_t stream) -{ +void naiveColNorm(Type *dots, const Type *data, int D, int N, NormType type, + bool do_sqrt, cudaStream_t stream) { static const int TPB = 64; - int nblks = raft::ceildiv(D, TPB); - naiveColNormKernel<<>>(dots, data, D, N, type, do_sqrt); + int nblks = raft::ceildiv(D, TPB); + naiveColNormKernel + <<>>(dots, data, D, N, type, do_sqrt); CUDA_CHECK(cudaPeekAtLastError()); } template class ColNormTest : public ::testing::TestWithParam> { public: - void SetUp() override - { + void SetUp() override { CUDA_CHECK(cudaStreamCreate(&stream)); params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); @@ -152,18 +148,19 @@ class ColNormTest : public ::testing::TestWithParam> { raft::allocate(dots_exp, cols); raft::allocate(dots_act, cols); - naiveColNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, stream); + naiveColNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, + stream); if (params.do_sqrt) { auto fin_op = [] __device__(T in) { return raft::mySqrt(in); }; - colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream, fin_op); + colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream, + fin_op); } else { colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream); } CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(dots_exp)); CUDA_CHECK(cudaFree(dots_act)); @@ -177,23 +174,24 @@ class ColNormTest : public ::testing::TestWithParam> { }; ///// Row- and column-wise tests -const std::vector> inputsf = {{0.00001f, 1024, 32, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL}, - {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL}, - {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL}, - {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL}, - - {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL}, - {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL}, - {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL}, - {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}}; +const std::vector> inputsf = { + {0.00001f, 1024, 32, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL}, + {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL}, + {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL}, + {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL}, + + {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL}, + {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL}, + {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL}, + {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}}; const std::vector> inputsd = { {0.00000001, 1024, 32, L1Norm, false, true, 1234ULL}, @@ -215,22 +213,22 @@ const std::vector> inputsd = { {0.00000001, 1024, 256, L2Norm, true, true, 1234ULL}}; typedef RowNormTest RowNormTestF; -TEST_P(RowNormTestF, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - dots_exp, dots_act, params.rows, raft::CompareApprox(params.tolerance))); +TEST_P(RowNormTestF, Result) { + ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows, + raft::CompareApprox(params.tolerance))); } typedef RowNormTest RowNormTestD; -TEST_P(RowNormTestD, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - dots_exp, dots_act, params.rows, raft::CompareApprox(params.tolerance))); +TEST_P(RowNormTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF, + ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD, + ::testing::ValuesIn(inputsd)); const std::vector> inputscf = { {0.00001f, 32, 1024, L1Norm, false, true, 1234ULL}, @@ -271,22 +269,22 @@ const std::vector> inputscd = { {0.00000001, 256, 1024, L2Norm, true, true, 1234ULL}}; typedef ColNormTest ColNormTestF; -TEST_P(ColNormTestF, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - dots_exp, dots_act, params.cols, raft::CompareApprox(params.tolerance))); +TEST_P(ColNormTestF, Result) { + ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.cols, + raft::CompareApprox(params.tolerance))); } typedef ColNormTest ColNormTestD; -TEST_P(ColNormTestD, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - dots_exp, dots_act, params.cols, raft::CompareApprox(params.tolerance))); +TEST_P(ColNormTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.cols, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF, ::testing::ValuesIn(inputscf)); +INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF, + ::testing::ValuesIn(inputscf)); -INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD, ::testing::ValuesIn(inputscd)); +INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD, + ::testing::ValuesIn(inputscd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index 85c84777e4..9082397265 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -34,8 +34,8 @@ struct ReduceInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const ReduceInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, + const ReduceInputs &dims) { return os; } @@ -43,55 +43,45 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void reduceLaunch(OutType* dots, - const InType* data, - int cols, - int rows, - bool rowMajor, - bool alongRows, - bool inplace, - cudaStream_t stream) -{ - reduce(dots, - data, - cols, - rows, - (OutType)0, - rowMajor, - alongRows, - stream, - inplace, - [] __device__(InType in, int i) { return static_cast(in * in); }); +void reduceLaunch(OutType *dots, const InType *data, int cols, int rows, + bool rowMajor, bool alongRows, bool inplace, + cudaStream_t stream) { + reduce( + dots, data, cols, rows, (OutType)0, rowMajor, alongRows, stream, inplace, + [] __device__(InType in, int i) { return static_cast(in * in); }); } template -class ReduceTest : public ::testing::TestWithParam> { +class ReduceTest + : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { CUDA_CHECK(cudaStreamCreate(&stream)); - params = ::testing::TestWithParam>::GetParam(); + params = + ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols; int len = rows * cols; - outlen = params.alongRows ? rows : cols; + outlen = params.alongRows ? rows : cols; raft::allocate(data, len); raft::allocate(dots_exp, outlen); raft::allocate(dots_act, outlen); r.uniform(data, len, InType(-1.0), InType(1.0), stream); - naiveReduction(dots_exp, data, cols, rows, params.rowMajor, params.alongRows, stream); + naiveReduction(dots_exp, data, cols, rows, params.rowMajor, + params.alongRows, stream); // Perform reduction with default inplace = false first - reduceLaunch(dots_act, data, cols, rows, params.rowMajor, params.alongRows, false, stream); + reduceLaunch(dots_act, data, cols, rows, params.rowMajor, params.alongRows, + false, stream); // Add to result with inplace = true next, which shouldn't affect // in the case of coalescedReduction! if (!(params.rowMajor ^ params.alongRows)) { - reduceLaunch(dots_act, data, cols, rows, params.rowMajor, params.alongRows, true, stream); + reduceLaunch(dots_act, data, cols, rows, params.rowMajor, + params.alongRows, true, stream); } } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(dots_exp)); CUDA_CHECK(cudaFree(dots_act)); @@ -100,7 +90,7 @@ class ReduceTest : public ::testing::TestWithParam protected: ReduceInputs params; - InType* data; + InType *data; OutType *dots_exp, *dots_act; int outlen; cudaStream_t stream; @@ -161,31 +151,31 @@ const std::vector> inputsfd = { {0.000002f, 1024, 256, false, false, 1234ULL}}; typedef ReduceTest ReduceTestFF; -TEST_P(ReduceTestFF, Result) -{ - ASSERT_TRUE( - devArrMatch(dots_exp, dots_act, outlen, raft::CompareApprox(params.tolerance))); +TEST_P(ReduceTestFF, Result) { + ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen, + raft::CompareApprox(params.tolerance))); } typedef ReduceTest ReduceTestDD; -TEST_P(ReduceTestDD, Result) -{ - ASSERT_TRUE( - devArrMatch(dots_exp, dots_act, outlen, raft::CompareApprox(params.tolerance))); +TEST_P(ReduceTestDD, Result) { + ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen, + raft::CompareApprox(params.tolerance))); } typedef ReduceTest ReduceTestFD; -TEST_P(ReduceTestFD, Result) -{ - ASSERT_TRUE( - devArrMatch(dots_exp, dots_act, outlen, raft::CompareApprox(params.tolerance))); +TEST_P(ReduceTestFD, Result) { + ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF, ::testing::ValuesIn(inputsff)); +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF, + ::testing::ValuesIn(inputsff)); -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD, ::testing::ValuesIn(inputsdd)); +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD, + ::testing::ValuesIn(inputsdd)); -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD, ::testing::ValuesIn(inputsfd)); +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD, + ::testing::ValuesIn(inputsfd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh index 86f9c2d8b8..30a9c2e271 100644 --- a/cpp/test/linalg/reduce.cuh +++ b/cpp/test/linalg/reduce.cuh @@ -26,69 +26,52 @@ namespace raft { namespace linalg { template -__global__ void naiveCoalescedReductionKernel(OutType* dots, const InType* data, int D, int N) -{ - OutType acc = (OutType)0; +__global__ void naiveCoalescedReductionKernel(OutType *dots, const InType *data, + int D, int N) { + OutType acc = (OutType)0; int rowStart = threadIdx.x + blockIdx.x * blockDim.x; if (rowStart < N) { for (int i = 0; i < D; ++i) { - acc += static_cast(data[rowStart * D + i] * data[rowStart * D + i]); + acc += + static_cast(data[rowStart * D + i] * data[rowStart * D + i]); } dots[rowStart] = 2 * acc; } } template -void naiveCoalescedReduction(OutType* dots, const InType* data, int D, int N, cudaStream_t stream) -{ +void naiveCoalescedReduction(OutType *dots, const InType *data, int D, int N, + cudaStream_t stream) { static const int TPB = 64; - int nblks = raft::ceildiv(N, TPB); - naiveCoalescedReductionKernel<<>>(dots, data, D, N); + int nblks = raft::ceildiv(N, TPB); + naiveCoalescedReductionKernel + <<>>(dots, data, D, N); CUDA_CHECK(cudaPeekAtLastError()); } template -void unaryAndGemv(OutType* dots, const InType* data, int D, int N, cudaStream_t stream) -{ - // computes a MLCommon unary op on data (squares it), then computes Ax +void unaryAndGemv(OutType *dots, const InType *data, int D, int N, + cudaStream_t stream) { + //computes a MLCommon unary op on data (squares it), then computes Ax //(A input matrix and x column vector) to sum columns thrust::device_vector sq(D * N); raft::linalg::unaryOp( - thrust::raw_pointer_cast(sq.data()), - data, - D * N, - [] __device__(InType v) { return static_cast(v * v); }, - stream); + thrust::raw_pointer_cast(sq.data()), data, D * N, + [] __device__(InType v) { return static_cast(v * v); }, stream); cublasHandle_t handle; CUBLAS_CHECK(cublasCreate(&handle)); - thrust::device_vector ones(N, 1); // column vector [1...1] + thrust::device_vector ones(N, 1); //column vector [1...1] OutType alpha = 1, beta = 0; - CUBLAS_CHECK(raft::linalg::cublasgemv(handle, - CUBLAS_OP_N, - D, - N, - &alpha, - thrust::raw_pointer_cast(sq.data()), - D, - thrust::raw_pointer_cast(ones.data()), - 1, - &beta, - dots, - 1, - stream)); + CUBLAS_CHECK(raft::linalg::cublasgemv( + handle, CUBLAS_OP_N, D, N, &alpha, thrust::raw_pointer_cast(sq.data()), D, + thrust::raw_pointer_cast(ones.data()), 1, &beta, dots, 1, stream)); CUDA_CHECK(cudaDeviceSynchronize()); CUBLAS_CHECK(cublasDestroy(handle)); } template -void naiveReduction(OutType* dots, - const InType* data, - int D, - int N, - bool rowMajor, - bool alongRows, - cudaStream_t stream) -{ +void naiveReduction(OutType *dots, const InType *data, int D, int N, + bool rowMajor, bool alongRows, cudaStream_t stream) { if (rowMajor && alongRows) { naiveCoalescedReduction(dots, data, D, N, stream); } else if (rowMajor && !alongRows) { diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu index 57699cb050..b27fa2ac1a 100644 --- a/cpp/test/linalg/strided_reduction.cu +++ b/cpp/test/linalg/strided_reduction.cu @@ -32,17 +32,17 @@ struct stridedReductionInputs { }; template -void stridedReductionLaunch(T* dots, const T* data, int cols, int rows, cudaStream_t stream) -{ - stridedReduction( - dots, data, cols, rows, (T)0, stream, false, [] __device__(T in, int i) { return in * in; }); +void stridedReductionLaunch(T *dots, const T *data, int cols, int rows, + cudaStream_t stream) { + stridedReduction(dots, data, cols, rows, (T)0, stream, false, + [] __device__(T in, int i) { return in * in; }); } template -class stridedReductionTest : public ::testing::TestWithParam> { +class stridedReductionTest + : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { CUDA_CHECK(cudaStreamCreate(&stream)); params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); @@ -50,17 +50,16 @@ class stridedReductionTest : public ::testing::TestWithParam> inputsf = {{0.00001f, 1024, 32, 1234ULL}, - {0.00001f, 1024, 64, 1234ULL}, - {0.00001f, 1024, 128, 1234ULL}, - {0.00001f, 1024, 256, 1234ULL}}; +const std::vector> inputsf = { + {0.00001f, 1024, 32, 1234ULL}, + {0.00001f, 1024, 64, 1234ULL}, + {0.00001f, 1024, 128, 1234ULL}, + {0.00001f, 1024, 256, 1234ULL}}; -const std::vector> inputsd = {{0.000000001, 1024, 32, 1234ULL}, - {0.000000001, 1024, 64, 1234ULL}, - {0.000000001, 1024, 128, 1234ULL}, - {0.000000001, 1024, 256, 1234ULL}}; +const std::vector> inputsd = { + {0.000000001, 1024, 32, 1234ULL}, + {0.000000001, 1024, 64, 1234ULL}, + {0.000000001, 1024, 128, 1234ULL}, + {0.000000001, 1024, 256, 1234ULL}}; typedef stridedReductionTest stridedReductionTestF; -TEST_P(stridedReductionTestF, Result) -{ - ASSERT_TRUE( - devArrMatch(dots_exp, dots_act, params.cols, raft::CompareApprox(params.tolerance))); +TEST_P(stridedReductionTestF, Result) { + ASSERT_TRUE(devArrMatch(dots_exp, dots_act, params.cols, + raft::CompareApprox(params.tolerance))); } typedef stridedReductionTest stridedReductionTestD; -TEST_P(stridedReductionTestD, Result) -{ - ASSERT_TRUE( - devArrMatch(dots_exp, dots_act, params.cols, raft::CompareApprox(params.tolerance))); +TEST_P(stridedReductionTestD, Result) { + ASSERT_TRUE(devArrMatch(dots_exp, dots_act, params.cols, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestF, + ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestD, + ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu index 4295b91f3e..ced3f65fdd 100644 --- a/cpp/test/linalg/subtract.cu +++ b/cpp/test/linalg/subtract.cu @@ -24,34 +24,39 @@ namespace raft { namespace linalg { template -__global__ void naiveSubtractElemKernel(Type* out, const Type* in1, const Type* in2, int len) -{ +__global__ void naiveSubtractElemKernel(Type *out, const Type *in1, + const Type *in2, int len) { int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { out[idx] = in1[idx] - in2[idx]; } + if (idx < len) { + out[idx] = in1[idx] - in2[idx]; + } } template -void naiveSubtractElem(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream) -{ +void naiveSubtractElem(Type *out, const Type *in1, const Type *in2, int len, + cudaStream_t stream) { static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveSubtractElemKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void naiveSubtractScalarKernel(Type* out, const Type* in1, const Type in2, int len) -{ +__global__ void naiveSubtractScalarKernel(Type *out, const Type *in1, + const Type in2, int len) { int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { out[idx] = in1[idx] - in2; } + if (idx < len) { + out[idx] = in1[idx] - in2; + } } template -void naiveSubtractScalar(Type* out, const Type* in1, const Type in2, int len, cudaStream_t stream) -{ +void naiveSubtractScalar(Type *out, const Type *in1, const Type in2, int len, + cudaStream_t stream) { static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); - naiveSubtractScalarKernel<<>>(out, in1, in2, len); + int nblks = raft::ceildiv(len, TPB); + naiveSubtractScalarKernel + <<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -63,16 +68,14 @@ struct SubtractInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const SubtractInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, const SubtractInputs &dims) { return os; } template class SubtractTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; @@ -95,8 +98,7 @@ class SubtractTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(in1)); CUDA_CHECK(cudaFree(in2)); CUDA_CHECK(cudaFree(out_ref)); @@ -108,33 +110,35 @@ class SubtractTest : public ::testing::TestWithParam> { T *in1, *in2, *out_ref, *out; }; -const std::vector> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf2 = { + {0.000001f, 1024 * 1024, 1234ULL}}; -const std::vector> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd2 = { + {0.00000001, 1024 * 1024, 1234ULL}}; typedef SubtractTest SubtractTestF; -TEST_P(SubtractTestF, Result) -{ - ASSERT_TRUE( - raft::devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); +TEST_P(SubtractTestF, Result) { + ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); - ASSERT_TRUE( - raft::devArrMatch(out_ref, in1, params.len, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch(out_ref, in1, params.len, + raft::CompareApprox(params.tolerance))); } typedef SubtractTest SubtractTestD; -TEST_P(SubtractTestD, Result) -{ - ASSERT_TRUE( - raft::devArrMatch(out_ref, out, params.len, raft::CompareApprox(params.tolerance))); +TEST_P(SubtractTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len, + raft::CompareApprox(params.tolerance))); - ASSERT_TRUE( - raft::devArrMatch(out_ref, in1, params.len, raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch(out_ref, in1, params.len, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestF, ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestF, + ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestD, ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestD, + ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu index e9e1a6dc02..fff321768f 100644 --- a/cpp/test/linalg/svd.cu +++ b/cpp/test/linalg/svd.cu @@ -35,21 +35,19 @@ struct SvdInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const SvdInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, const SvdInputs &dims) { return os; } template class SvdTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { raft::handle_t handle; params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - int len = params.len; + int len = params.len; cudaStream_t stream = handle.get_stream(); raft::allocate(data, len); @@ -58,7 +56,7 @@ class SvdTest : public ::testing::TestWithParam> { T data_h[] = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0}; raft::update_device(data, data_h, len, stream); - int left_evl = params.n_row * params.n_col; + int left_evl = params.n_row * params.n_col; int right_evl = params.n_col * params.n_col; raft::allocate(left_eig_vectors_qr, left_evl); @@ -69,7 +67,8 @@ class SvdTest : public ::testing::TestWithParam> { // allocate(right_eig_vectors_trans_jacobi, right_evl); // allocate(sing_vals_jacobi, params.n_col); - T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695, 0.488195, 0.110706, -0.865685}; + T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695, + 0.488195, 0.110706, -0.865685}; T right_eig_vectors_ref_h[] = {-0.638636, -0.769509, -0.769509, 0.638636}; @@ -79,25 +78,18 @@ class SvdTest : public ::testing::TestWithParam> { raft::allocate(right_eig_vectors_ref, right_evl); raft::allocate(sing_vals_ref, params.n_col); - raft::update_device(left_eig_vectors_ref, left_eig_vectors_ref_h, left_evl, stream); - raft::update_device(right_eig_vectors_ref, right_eig_vectors_ref_h, right_evl, stream); + raft::update_device(left_eig_vectors_ref, left_eig_vectors_ref_h, left_evl, + stream); + raft::update_device(right_eig_vectors_ref, right_eig_vectors_ref_h, + right_evl, stream); raft::update_device(sing_vals_ref, sing_vals_ref_h, params.n_col, stream); - svdQR(handle, - data, - params.n_row, - params.n_col, - sing_vals_qr, - left_eig_vectors_qr, - right_eig_vectors_trans_qr, - true, - true, - true, + svdQR(handle, data, params.n_row, params.n_col, sing_vals_qr, + left_eig_vectors_qr, right_eig_vectors_trans_qr, true, true, true, stream); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(left_eig_vectors_qr)); CUDA_CHECK(cudaFree(right_eig_vectors_trans_qr)); @@ -109,71 +101,69 @@ class SvdTest : public ::testing::TestWithParam> { protected: SvdInputs params; - T *data, *left_eig_vectors_qr, *right_eig_vectors_trans_qr, *sing_vals_qr, *left_eig_vectors_ref, - *right_eig_vectors_ref, *sing_vals_ref; + T *data, *left_eig_vectors_qr, *right_eig_vectors_trans_qr, *sing_vals_qr, + *left_eig_vectors_ref, *right_eig_vectors_ref, *sing_vals_ref; }; -const std::vector> inputsf2 = {{0.00001f, 3 * 2, 3, 2, 1234ULL}}; +const std::vector> inputsf2 = { + {0.00001f, 3 * 2, 3, 2, 1234ULL}}; -const std::vector> inputsd2 = {{0.00001, 3 * 2, 3, 2, 1234ULL}}; +const std::vector> inputsd2 = { + {0.00001, 3 * 2, 3, 2, 1234ULL}}; typedef SvdTest SvdTestValF; -TEST_P(SvdTestValF, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - sing_vals_ref, sing_vals_qr, params.n_col, raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestValF, Result) { + ASSERT_TRUE( + raft::devArrMatch(sing_vals_ref, sing_vals_qr, params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestValD; -TEST_P(SvdTestValD, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - sing_vals_ref, sing_vals_qr, params.n_col, raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestValD, Result) { + ASSERT_TRUE( + raft::devArrMatch(sing_vals_ref, sing_vals_qr, params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestLeftVecF; -TEST_P(SvdTestLeftVecF, Result) -{ - ASSERT_TRUE(raft::devArrMatch(left_eig_vectors_ref, - left_eig_vectors_qr, - params.n_row * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestLeftVecF, Result) { + ASSERT_TRUE(raft::devArrMatch( + left_eig_vectors_ref, left_eig_vectors_qr, params.n_row * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestLeftVecD; -TEST_P(SvdTestLeftVecD, Result) -{ - ASSERT_TRUE(raft::devArrMatch(left_eig_vectors_ref, - left_eig_vectors_qr, - params.n_row * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestLeftVecD, Result) { + ASSERT_TRUE(raft::devArrMatch( + left_eig_vectors_ref, left_eig_vectors_qr, params.n_row * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestRightVecF; -TEST_P(SvdTestRightVecF, Result) -{ - ASSERT_TRUE(raft::devArrMatch(right_eig_vectors_ref, - right_eig_vectors_trans_qr, - params.n_col * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestRightVecF, Result) { + ASSERT_TRUE( + raft::devArrMatch(right_eig_vectors_ref, right_eig_vectors_trans_qr, + params.n_col * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestRightVecD; -TEST_P(SvdTestRightVecD, Result) -{ - ASSERT_TRUE(raft::devArrMatch(right_eig_vectors_ref, - right_eig_vectors_trans_qr, - params.n_col * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestRightVecD, Result) { + ASSERT_TRUE( + raft::devArrMatch(right_eig_vectors_ref, right_eig_vectors_trans_qr, + params.n_col * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValF, ::testing::ValuesIn(inputsf2)); INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecF, ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecF, + ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecD, ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecD, + ::testing::ValuesIn(inputsd2)); // INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecF, // ::testing::ValuesIn(inputsf2)); diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index 659bed04c6..f10b029962 100644 --- a/cpp/test/linalg/transpose.cu +++ b/cpp/test/linalg/transpose.cu @@ -34,16 +34,14 @@ struct TranposeInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const TranposeInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, const TranposeInputs &dims) { return os; } template class TransposeTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); stream = handle.get_stream(); @@ -65,8 +63,7 @@ class TransposeTest : public ::testing::TestWithParam> { transpose(data, params.n_row, stream); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(data_trans)); CUDA_CHECK(cudaFree(data_trans_ref)); @@ -79,33 +76,39 @@ class TransposeTest : public ::testing::TestWithParam> { cudaStream_t stream; }; -const std::vector> inputsf2 = {{0.1f, 3 * 3, 3, 3, 1234ULL}}; +const std::vector> inputsf2 = { + {0.1f, 3 * 3, 3, 3, 1234ULL}}; -const std::vector> inputsd2 = {{0.1, 3 * 3, 3, 3, 1234ULL}}; +const std::vector> inputsd2 = { + {0.1, 3 * 3, 3, 3, 1234ULL}}; typedef TransposeTest TransposeTestValF; -TEST_P(TransposeTestValF, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - data_trans_ref, data_trans, params.len, raft::CompareApproxAbs(params.tolerance))); - - ASSERT_TRUE(raft::devArrMatch( - data_trans_ref, data, params.len, raft::CompareApproxAbs(params.tolerance))); +TEST_P(TransposeTestValF, Result) { + ASSERT_TRUE( + raft::devArrMatch(data_trans_ref, data_trans, params.len, + raft::CompareApproxAbs(params.tolerance))); + + ASSERT_TRUE( + raft::devArrMatch(data_trans_ref, data, params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef TransposeTest TransposeTestValD; -TEST_P(TransposeTestValD, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - data_trans_ref, data_trans, params.len, raft::CompareApproxAbs(params.tolerance))); - - ASSERT_TRUE(raft::devArrMatch( - data_trans_ref, data, params.len, raft::CompareApproxAbs(params.tolerance))); +TEST_P(TransposeTestValD, Result) { + ASSERT_TRUE( + raft::devArrMatch(data_trans_ref, data_trans, params.len, + raft::CompareApproxAbs(params.tolerance))); + + ASSERT_TRUE( + raft::devArrMatch(data_trans_ref, data, params.len, + raft::CompareApproxAbs(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF, ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF, + ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD, ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD, + ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu index 6349a1907a..666ab8619d 100644 --- a/cpp/test/linalg/unary_op.cu +++ b/cpp/test/linalg/unary_op.cu @@ -28,25 +28,28 @@ namespace linalg { // for an extended __device__ lambda cannot have private or protected access // within its class template -void unaryOpLaunch(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) -{ +void unaryOpLaunch(OutType *out, const InType *in, InType scalar, IdxType len, + cudaStream_t stream) { if (in == nullptr) { auto op = [scalar] __device__(OutType * ptr, IdxType idx) { *ptr = static_cast(scalar * idx); }; writeOnlyUnaryOp(out, len, op, stream); } else { - auto op = [scalar] __device__(InType in) { return static_cast(in * scalar); }; + auto op = [scalar] __device__(InType in) { + return static_cast(in * scalar); + }; unaryOp(out, in, len, op, stream); } } template -class UnaryOpTest : public ::testing::TestWithParam> { +class UnaryOpTest + : public ::testing::TestWithParam> { protected: - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); + void SetUp() override { + params = ::testing::TestWithParam< + UnaryOpInputs>::GetParam(); raft::random::Rng r(params.seed); CUDA_CHECK(cudaStreamCreate(&stream)); auto len = params.len; @@ -56,8 +59,7 @@ class UnaryOpTest : public ::testing::TestWithParam(params.tolerance))); + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); } UnaryOpInputs params; - InType* in; + InType *in; OutType *out_ref, *out; cudaStream_t stream; }; @@ -84,15 +86,14 @@ class UnaryOpTest : public ::testing::TestWithParam class WriteOnlyUnaryOpTest : public UnaryOpTest { protected: - void DoTest() override - { - auto len = this->params.len; + void DoTest() override { + auto len = this->params.len; auto scalar = this->params.scalar; - naiveScale(this->out_ref, (OutType*)nullptr, scalar, len, this->stream); - unaryOpLaunch(this->out, (OutType*)nullptr, scalar, len, this->stream); + naiveScale(this->out_ref, (OutType *)nullptr, scalar, len, this->stream); + unaryOpLaunch(this->out, (OutType *)nullptr, scalar, len, this->stream); CUDA_CHECK(cudaStreamSynchronize(this->stream)); - ASSERT_TRUE(devArrMatch( - this->out_ref, this->out, this->params.len, CompareApprox(this->params.tolerance))); + ASSERT_TRUE(devArrMatch(this->out_ref, this->out, this->params.len, + CompareApprox(this->params.tolerance))); } }; @@ -100,7 +101,8 @@ class WriteOnlyUnaryOpTest : public UnaryOpTest { TEST_P(Name, Result) { DoTest(); } \ INSTANTIATE_TEST_SUITE_P(UnaryOpTests, Name, ::testing::ValuesIn(inputs)) -const std::vector> inputsf_i32 = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf_i32 = { + {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; typedef UnaryOpTest UnaryOpTestF_i32; UNARY_OP_TEST(UnaryOpTestF_i32, inputsf_i32); typedef WriteOnlyUnaryOpTest WriteOnlyUnaryOpTestF_i32; diff --git a/cpp/test/linalg/unary_op.cuh b/cpp/test/linalg/unary_op.cuh index 3343389af8..be3f1124c5 100644 --- a/cpp/test/linalg/unary_op.cuh +++ b/cpp/test/linalg/unary_op.cuh @@ -24,8 +24,8 @@ namespace raft { namespace linalg { template -__global__ void naiveScaleKernel(OutType* out, const InType* in, InType scalar, IdxType len) -{ +__global__ void naiveScaleKernel(OutType *out, const InType *in, InType scalar, + IdxType len) { IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x); if (idx < len) { if (in == nullptr) { @@ -38,11 +38,12 @@ __global__ void naiveScaleKernel(OutType* out, const InType* in, InType scalar, } template -void naiveScale(OutType* out, const InType* in, InType scalar, int len, cudaStream_t stream) -{ +void naiveScale(OutType *out, const InType *in, InType scalar, int len, + cudaStream_t stream) { static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); - naiveScaleKernel<<>>(out, in, scalar, len); + int nblks = raft::ceildiv(len, TPB); + naiveScaleKernel + <<>>(out, in, scalar, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -55,8 +56,8 @@ struct UnaryOpInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const UnaryOpInputs& d) -{ +::std::ostream &operator<<(::std::ostream &os, + const UnaryOpInputs &d) { return os; } diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu index 9cdd36b252..578139623a 100644 --- a/cpp/test/matrix/math.cu +++ b/cpp/test/matrix/math.cu @@ -24,51 +24,53 @@ namespace raft { namespace matrix { template -__global__ void nativePowerKernel(Type* in, Type* out, int len) -{ +__global__ void nativePowerKernel(Type *in, Type *out, int len) { int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { out[idx] = in[idx] * in[idx]; } + if (idx < len) { + out[idx] = in[idx] * in[idx]; + } } template -void naivePower(Type* in, Type* out, int len, cudaStream_t stream) -{ +void naivePower(Type *in, Type *out, int len, cudaStream_t stream) { static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); nativePowerKernel<<>>(in, out, len); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void nativeSqrtKernel(Type* in, Type* out, int len) -{ +__global__ void nativeSqrtKernel(Type *in, Type *out, int len) { int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { out[idx] = sqrt(in[idx]); } + if (idx < len) { + out[idx] = sqrt(in[idx]); + } } template -void naiveSqrt(Type* in, Type* out, int len) -{ +void naiveSqrt(Type *in, Type *out, int len) { static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); nativeSqrtKernel<<>>(in, out, len); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void naiveSignFlipKernel(Type* in, Type* out, int rowCount, int colCount) -{ +__global__ void naiveSignFlipKernel(Type *in, Type *out, int rowCount, + int colCount) { int d_i = blockIdx.x * rowCount; int end = d_i + rowCount; if (blockIdx.x < colCount) { - Type max = 0.0; + Type max = 0.0; int max_index = 0; for (int i = d_i; i < end; i++) { Type val = in[i]; - if (val < 0.0) { val = -val; } + if (val < 0.0) { + val = -val; + } if (val > max) { - max = val; + max = val; max_index = i; } } @@ -86,8 +88,7 @@ __global__ void naiveSignFlipKernel(Type* in, Type* out, int rowCount, int colCo } template -void naiveSignFlip(Type* in, Type* out, int rowCount, int colCount) -{ +void naiveSignFlip(Type *in, Type *out, int rowCount, int colCount) { naiveSignFlipKernel<<>>(in, out, rowCount, colCount); CUDA_CHECK(cudaPeekAtLastError()); } @@ -102,16 +103,14 @@ struct MathInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const MathInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, const MathInputs &dims) { return os; } template class MathTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); random::Rng r(params.seed); int len = params.len; @@ -155,7 +154,7 @@ class MathTest : public ::testing::TestWithParam> { allocate(in_recip_ref, 4); allocate(out_recip, 4); // default threshold is 1e-15 - std::vector in_recip_h = {0.1, 0.01, -0.01, 0.1e-16}; + std::vector in_recip_h = {0.1, 0.01, -0.01, 0.1e-16}; std::vector in_recip_ref_h = {10.0, 100.0, -100.0, 0.0}; update_device(in_recip, in_recip_h.data(), 4, stream); update_device(in_recip_ref, in_recip_ref_h.data(), 4, stream); @@ -166,7 +165,7 @@ class MathTest : public ::testing::TestWithParam> { reciprocal(in_recip, recip_scalar, 4, stream, true); - std::vector in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1}; + std::vector in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1}; std::vector in_small_val_zero_ref_h = {0.1, 0.0, 0.0, -0.1}; allocate(in_smallzero, 4); allocate(out_smallzero, 4); @@ -178,8 +177,7 @@ class MathTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(in_power)); CUDA_CHECK(cudaFree(out_power_ref)); CUDA_CHECK(cudaFree(in_sqrt)); @@ -198,129 +196,137 @@ class MathTest : public ::testing::TestWithParam> { protected: MathInputs params; - T *in_power, *out_power_ref, *in_sqrt, *out_sqrt_ref, *in_ratio, *out_ratio_ref, *in_sign_flip, - *out_sign_flip_ref, *in_recip, *in_recip_ref, *out_recip, *in_smallzero, *out_smallzero, - *out_smallzero_ref; + T *in_power, *out_power_ref, *in_sqrt, *out_sqrt_ref, *in_ratio, + *out_ratio_ref, *in_sign_flip, *out_sign_flip_ref, *in_recip, *in_recip_ref, + *out_recip, *in_smallzero, *out_smallzero, *out_smallzero_ref; }; -const std::vector> inputsf = {{0.00001f, 1024, 1024, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf = { + {0.00001f, 1024, 1024, 1024 * 1024, 1234ULL}}; -const std::vector> inputsd = {{0.00001, 1024, 1024, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd = { + {0.00001, 1024, 1024, 1024 * 1024, 1234ULL}}; typedef MathTest MathPowerTestF; -TEST_P(MathPowerTestF, Result) -{ - ASSERT_TRUE( - devArrMatch(in_power, out_power_ref, params.len, CompareApprox(params.tolerance))); +TEST_P(MathPowerTestF, Result) { + ASSERT_TRUE(devArrMatch(in_power, out_power_ref, params.len, + CompareApprox(params.tolerance))); } typedef MathTest MathPowerTestD; -TEST_P(MathPowerTestD, Result) -{ - ASSERT_TRUE( - devArrMatch(in_power, out_power_ref, params.len, CompareApprox(params.tolerance))); +TEST_P(MathPowerTestD, Result) { + ASSERT_TRUE(devArrMatch(in_power, out_power_ref, params.len, + CompareApprox(params.tolerance))); } typedef MathTest MathSqrtTestF; -TEST_P(MathSqrtTestF, Result) -{ - ASSERT_TRUE( - devArrMatch(in_sqrt, out_sqrt_ref, params.len, CompareApprox(params.tolerance))); +TEST_P(MathSqrtTestF, Result) { + ASSERT_TRUE(devArrMatch(in_sqrt, out_sqrt_ref, params.len, + CompareApprox(params.tolerance))); } typedef MathTest MathSqrtTestD; -TEST_P(MathSqrtTestD, Result) -{ - ASSERT_TRUE( - devArrMatch(in_sqrt, out_sqrt_ref, params.len, CompareApprox(params.tolerance))); +TEST_P(MathSqrtTestD, Result) { + ASSERT_TRUE(devArrMatch(in_sqrt, out_sqrt_ref, params.len, + CompareApprox(params.tolerance))); } typedef MathTest MathRatioTestF; -TEST_P(MathRatioTestF, Result) -{ - ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, CompareApprox(params.tolerance))); +TEST_P(MathRatioTestF, Result) { + ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, + CompareApprox(params.tolerance))); } typedef MathTest MathRatioTestD; -TEST_P(MathRatioTestD, Result) -{ - ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, CompareApprox(params.tolerance))); +TEST_P(MathRatioTestD, Result) { + ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, + CompareApprox(params.tolerance))); } typedef MathTest MathSignFlipTestF; -TEST_P(MathSignFlipTestF, Result) -{ - ASSERT_TRUE(devArrMatch( - in_sign_flip, out_sign_flip_ref, params.len, CompareApprox(params.tolerance))); +TEST_P(MathSignFlipTestF, Result) { + ASSERT_TRUE(devArrMatch(in_sign_flip, out_sign_flip_ref, params.len, + CompareApprox(params.tolerance))); } typedef MathTest MathSignFlipTestD; -TEST_P(MathSignFlipTestD, Result) -{ - ASSERT_TRUE(devArrMatch( - in_sign_flip, out_sign_flip_ref, params.len, CompareApprox(params.tolerance))); +TEST_P(MathSignFlipTestD, Result) { + ASSERT_TRUE(devArrMatch(in_sign_flip, out_sign_flip_ref, params.len, + CompareApprox(params.tolerance))); } typedef MathTest MathReciprocalTestF; -TEST_P(MathReciprocalTestF, Result) -{ - ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4, CompareApprox(params.tolerance))); +TEST_P(MathReciprocalTestF, Result) { + ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4, + CompareApprox(params.tolerance))); // 4-th term tests `setzero=true` functionality, not present in this version of `reciprocal`. - ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3, CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3, + CompareApprox(params.tolerance))); } typedef MathTest MathReciprocalTestD; -TEST_P(MathReciprocalTestD, Result) -{ - ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4, CompareApprox(params.tolerance))); +TEST_P(MathReciprocalTestD, Result) { + ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4, + CompareApprox(params.tolerance))); // 4-th term tests `setzero=true` functionality, not present in this version of `reciprocal`. - ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3, CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3, + CompareApprox(params.tolerance))); } typedef MathTest MathSetSmallZeroTestF; -TEST_P(MathSetSmallZeroTestF, Result) -{ - ASSERT_TRUE( - devArrMatch(in_smallzero, out_smallzero_ref, 4, CompareApprox(params.tolerance))); +TEST_P(MathSetSmallZeroTestF, Result) { + ASSERT_TRUE(devArrMatch(in_smallzero, out_smallzero_ref, 4, + CompareApprox(params.tolerance))); - ASSERT_TRUE( - devArrMatch(out_smallzero, out_smallzero_ref, 4, CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(out_smallzero, out_smallzero_ref, 4, + CompareApprox(params.tolerance))); } typedef MathTest MathSetSmallZeroTestD; -TEST_P(MathSetSmallZeroTestD, Result) -{ - ASSERT_TRUE( - devArrMatch(in_smallzero, out_smallzero_ref, 4, CompareApprox(params.tolerance))); +TEST_P(MathSetSmallZeroTestD, Result) { + ASSERT_TRUE(devArrMatch(in_smallzero, out_smallzero_ref, 4, + CompareApprox(params.tolerance))); - ASSERT_TRUE( - devArrMatch(out_smallzero, out_smallzero_ref, 4, CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(out_smallzero, out_smallzero_ref, 4, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestF, + ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestD, + ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestF, + ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestD, + ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestF, + ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestD, + ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestF, + ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestD, + ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestF, + ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestD, + ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestF, + ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestD, + ::testing::ValuesIn(inputsd)); } // namespace matrix } // namespace raft diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu index fc5a418bda..28222c0697 100644 --- a/cpp/test/matrix/matrix.cu +++ b/cpp/test/matrix/matrix.cu @@ -32,16 +32,14 @@ struct MatrixInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const MatrixInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, const MatrixInputs &dims) { return os; } template class MatrixTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.n_row * params.n_col; @@ -56,14 +54,13 @@ class MatrixTest : public ::testing::TestWithParam> { // copy(in1, in1_revr, params.n_row, params.n_col); // colReverse(in1_revr, params.n_row, params.n_col); - T* outTrunc; + T *outTrunc; raft::allocate(outTrunc, 6); truncZeroOrigin(in1, params.n_row, outTrunc, 3, 2, stream); CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(in1)); CUDA_CHECK(cudaFree(in2)); // CUDA_CHECK(cudaFree(in1_revr)); @@ -76,30 +73,31 @@ class MatrixTest : public ::testing::TestWithParam> { const std::vector> inputsf2 = {{0.000001f, 4, 4, 1234ULL}}; -const std::vector> inputsd2 = {{0.00000001, 4, 4, 1234ULL}}; +const std::vector> inputsd2 = { + {0.00000001, 4, 4, 1234ULL}}; typedef MatrixTest MatrixTestF; -TEST_P(MatrixTestF, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - in1, in2, params.n_row * params.n_col, raft::CompareApprox(params.tolerance))); +TEST_P(MatrixTestF, Result) { + ASSERT_TRUE(raft::devArrMatch(in1, in2, params.n_row * params.n_col, + raft::CompareApprox(params.tolerance))); } typedef MatrixTest MatrixTestD; -TEST_P(MatrixTestD, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - in1, in2, params.n_row * params.n_col, raft::CompareApprox(params.tolerance))); +TEST_P(MatrixTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(in1, in2, params.n_row * params.n_col, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestF, ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestF, + ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestD, ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestD, + ::testing::ValuesIn(inputsd2)); template class MatrixCopyRowsTest : public ::testing::Test { - using math_t = typename std::tuple_element<0, T>::type; - using idx_t = typename std::tuple_element<1, T>::type; + using math_t = typename std::tuple_element<0, T>::type; + using idx_t = typename std::tuple_element<1, T>::type; using idx_array_t = typename std::tuple_element<2, T>::type; protected: @@ -107,38 +105,42 @@ class MatrixCopyRowsTest : public ::testing::Test { : allocator(handle.get_device_allocator()), input(allocator, handle.get_stream(), n_cols * n_rows), indices(allocator, handle.get_stream(), n_selected), - output(allocator, handle.get_stream(), n_cols * n_selected) - { + output(allocator, handle.get_stream(), n_cols * n_selected) { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); raft::update_device(indices.data(), indices_host, n_selected, stream); // Init input array thrust::counting_iterator first(0); thrust::device_ptr ptr(input.data()); - thrust::copy(thrust::cuda::par.on(stream), first, first + n_cols * n_rows, ptr); + thrust::copy(thrust::cuda::par.on(stream), first, first + n_cols * n_rows, + ptr); } void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } - void testCopyRows() - { - copyRows( - input.data(), n_rows, n_cols, output.data(), indices.data(), n_selected, stream, false); - EXPECT_TRUE(raft::devArrMatchHost( - output_exp_colmajor, output.data(), n_selected * n_cols, raft::Compare())); - copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(), n_selected, stream, true); - EXPECT_TRUE(raft::devArrMatchHost( - output_exp_rowmajor, output.data(), n_selected * n_cols, raft::Compare())); + void testCopyRows() { + copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(), + n_selected, stream, false); + EXPECT_TRUE(raft::devArrMatchHost(output_exp_colmajor, output.data(), + n_selected * n_cols, + raft::Compare())); + copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(), + n_selected, stream, true); + EXPECT_TRUE(raft::devArrMatchHost(output_exp_rowmajor, output.data(), + n_selected * n_cols, + raft::Compare())); } protected: - int n_rows = 10; - int n_cols = 3; + int n_rows = 10; + int n_cols = 3; int n_selected = 5; - idx_array_t indices_host[5] = {0, 3, 4, 7, 9}; - math_t output_exp_colmajor[15] = {0, 3, 4, 7, 9, 10, 13, 14, 17, 19, 20, 23, 24, 27, 29}; - math_t output_exp_rowmajor[15] = {0, 1, 2, 9, 10, 11, 12, 13, 14, 21, 22, 23, 27, 28, 29}; + idx_array_t indices_host[5] = {0, 3, 4, 7, 9}; + math_t output_exp_colmajor[15] = {0, 3, 4, 7, 9, 10, 13, 14, + 17, 19, 20, 23, 24, 27, 29}; + math_t output_exp_rowmajor[15] = {0, 1, 2, 9, 10, 11, 12, 13, + 14, 21, 22, 23, 27, 28, 29}; raft::handle_t handle; cudaStream_t stream; std::shared_ptr allocator; @@ -147,10 +149,10 @@ class MatrixCopyRowsTest : public ::testing::Test { raft::mr::device::buffer indices; }; -using TypeTuple = ::testing::Types, - std::tuple, - std::tuple, - std::tuple>; +using TypeTuple = + ::testing::Types, std::tuple, + std::tuple, + std::tuple>; TYPED_TEST_CASE(MatrixCopyRowsTest, TypeTuple); TYPED_TEST(MatrixCopyRowsTest, CopyRows) { this->testCopyRows(); } diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp index 9ba2c3332b..223efdbfe8 100644 --- a/cpp/test/mr/device/buffer.cpp +++ b/cpp/test/mr/device/buffer.cpp @@ -25,8 +25,7 @@ namespace raft { namespace mr { namespace device { -TEST(Raft, DeviceBufferAlloc) -{ +TEST(Raft, DeviceBufferAlloc) { auto alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -53,14 +52,13 @@ TEST(Raft, DeviceBufferAlloc) CUDA_CHECK(cudaStreamDestroy(stream)); } -TEST(Raft, DeviceBufferZeroResize) -{ +TEST(Raft, DeviceBufferZeroResize) { // Create a limiting_resource_adaptor to track allocations - auto curr_mr = - dynamic_cast(rmm::mr::get_current_device_resource()); - auto limit_mr = - std::make_shared>(curr_mr, - 1000); + auto curr_mr = dynamic_cast( + rmm::mr::get_current_device_resource()); + auto limit_mr = std::make_shared< + rmm::mr::limiting_resource_adaptor>(curr_mr, + 1000); rmm::mr::set_current_device_resource(limit_mr.get()); diff --git a/cpp/test/mr/host/buffer.cpp b/cpp/test/mr/host/buffer.cpp index aadf05285c..953f65ddfb 100644 --- a/cpp/test/mr/host/buffer.cpp +++ b/cpp/test/mr/host/buffer.cpp @@ -24,8 +24,7 @@ namespace raft { namespace mr { namespace host { -TEST(Raft, HostBuffer) -{ +TEST(Raft, HostBuffer) { auto alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -52,14 +51,14 @@ TEST(Raft, HostBuffer) CUDA_CHECK(cudaStreamDestroy(stream)); } -TEST(Raft, DeviceToHostBuffer) -{ +TEST(Raft, DeviceToHostBuffer) { auto d_alloc = std::make_shared(); auto h_alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); device::buffer d_buff(d_alloc, stream, 32); - CUDA_CHECK(cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream)); + CUDA_CHECK( + cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream)); buffer h_buff(h_alloc, d_buff); ASSERT_EQ(d_buff.size(), h_buff.size()); CUDA_CHECK(cudaStreamSynchronize(stream)); diff --git a/cpp/test/mst.cu b/cpp/test/mst.cu index 5560c61c73..d7aa76500b 100644 --- a/cpp/test/mst.cu +++ b/cpp/test/mst.cu @@ -54,8 +54,7 @@ namespace mst { // Sequential prims function // Returns total weight of MST template -weight_t prims(CSRHost& csr_h) -{ +weight_t prims(CSRHost &csr_h) { auto n_vertices = csr_h.offsets.size() - 1; bool active_vertex[n_vertices]; @@ -64,18 +63,19 @@ weight_t prims(CSRHost& csr_h) for (auto i = 0; i < n_vertices; i++) { active_vertex[i] = false; - curr_edge[i] = INT_MAX; + curr_edge[i] = INT_MAX; } curr_edge[0] = 0; // function to pick next min vertex-edge - auto min_vertex_edge = [](auto* curr_edge, auto* active_vertex, auto n_vertices) { + auto min_vertex_edge = [](auto *curr_edge, auto *active_vertex, + auto n_vertices) { weight_t min = INT_MAX; vertex_t min_vertex; for (auto v = 0; v < n_vertices; v++) { if (!active_vertex[v] && curr_edge[v] < min) { - min = curr_edge[v]; + min = curr_edge[v]; min_vertex = v; } } @@ -91,13 +91,14 @@ weight_t prims(CSRHost& csr_h) active_vertex[curr_v] = true; // set to active // iterate through edges of current active vertex - auto edge_st = csr_h.offsets[curr_v]; + auto edge_st = csr_h.offsets[curr_v]; auto edge_end = csr_h.offsets[curr_v + 1]; for (auto e = edge_st; e < edge_end; e++) { // put edges to be considered for next iteration auto neighbor_idx = csr_h.indices[e]; - if (!active_vertex[neighbor_idx] && csr_h.weights[e] < curr_edge[neighbor_idx]) { + if (!active_vertex[neighbor_idx] && + csr_h.weights[e] < curr_edge[neighbor_idx]) { curr_edge[neighbor_idx] = csr_h.weights[e]; } } @@ -113,101 +114,99 @@ weight_t prims(CSRHost& csr_h) } template -class MSTTest : public ::testing::TestWithParam> { +class MSTTest + : public ::testing::TestWithParam> { protected: std::pair, raft::Graph_COO> - mst_gpu() - { - edge_t* offsets = static_cast(csr_d.offsets.data()); - vertex_t* indices = static_cast(csr_d.indices.data()); - weight_t* weights = static_cast(csr_d.weights.data()); + mst_gpu() { + edge_t *offsets = static_cast(csr_d.offsets.data()); + vertex_t *indices = static_cast(csr_d.indices.data()); + weight_t *weights = static_cast(csr_d.weights.data()); v = static_cast((csr_d.offsets.size() / sizeof(vertex_t)) - 1); e = static_cast(csr_d.indices.size() / sizeof(edge_t)); - rmm::device_vector mst_src(2 * v - 2, std::numeric_limits::max()); - rmm::device_vector mst_dst(2 * v - 2, std::numeric_limits::max()); + rmm::device_vector mst_src(2 * v - 2, + std::numeric_limits::max()); + rmm::device_vector mst_dst(2 * v - 2, + std::numeric_limits::max()); rmm::device_vector color(v, 0); - vertex_t* color_ptr = thrust::raw_pointer_cast(color.data()); + vertex_t *color_ptr = thrust::raw_pointer_cast(color.data()); if (iterations == 0) { MST_solver symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), true, true, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), + true, true, 0); auto symmetric_result = symmetric_solver.solve(); MST_solver non_symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), false, true, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), + false, true, 0); auto non_symmetric_result = non_symmetric_solver.solve(); EXPECT_LE(symmetric_result.n_edges, 2 * v - 2); EXPECT_LE(non_symmetric_result.n_edges, v - 1); - return std::make_pair(std::move(symmetric_result), std::move(non_symmetric_result)); + return std::make_pair(std::move(symmetric_result), + std::move(non_symmetric_result)); } else { - MST_solver intermediate_solver(handle, - offsets, - indices, - weights, - v, - e, - color_ptr, - handle.get_stream(), - true, - true, - iterations); + MST_solver intermediate_solver( + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), + true, true, iterations); auto intermediate_result = intermediate_solver.solve(); MST_solver symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), true, false, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), + true, false, 0); auto symmetric_result = symmetric_solver.solve(); // symmetric_result.n_edges += intermediate_result.n_edges; - auto total_edge_size = symmetric_result.n_edges + intermediate_result.n_edges; + auto total_edge_size = + symmetric_result.n_edges + intermediate_result.n_edges; symmetric_result.src.resize(total_edge_size, handle.get_stream()); symmetric_result.dst.resize(total_edge_size, handle.get_stream()); symmetric_result.weights.resize(total_edge_size, handle.get_stream()); raft::copy(symmetric_result.src.data() + symmetric_result.n_edges, - intermediate_result.src.data(), - intermediate_result.n_edges, + intermediate_result.src.data(), intermediate_result.n_edges, handle.get_stream()); raft::copy(symmetric_result.dst.data() + symmetric_result.n_edges, - intermediate_result.dst.data(), - intermediate_result.n_edges, + intermediate_result.dst.data(), intermediate_result.n_edges, handle.get_stream()); raft::copy(symmetric_result.weights.data() + symmetric_result.n_edges, intermediate_result.weights.data(), - intermediate_result.n_edges, - handle.get_stream()); + intermediate_result.n_edges, handle.get_stream()); symmetric_result.n_edges = total_edge_size; MST_solver non_symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), false, true, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), + false, true, 0); auto non_symmetric_result = non_symmetric_solver.solve(); EXPECT_LE(symmetric_result.n_edges, 2 * v - 2); EXPECT_LE(non_symmetric_result.n_edges, v - 1); - return std::make_pair(std::move(symmetric_result), std::move(non_symmetric_result)); + return std::make_pair(std::move(symmetric_result), + std::move(non_symmetric_result)); } } - void SetUp() override - { - mst_input = ::testing::TestWithParam>::GetParam(); + void SetUp() override { + mst_input = ::testing::TestWithParam< + MSTTestInput>::GetParam(); iterations = mst_input.iterations; - csr_d.offsets = rmm::device_buffer(mst_input.csr_h.offsets.data(), - mst_input.csr_h.offsets.size() * sizeof(edge_t), - handle.get_stream()); - csr_d.indices = rmm::device_buffer(mst_input.csr_h.indices.data(), - mst_input.csr_h.indices.size() * sizeof(vertex_t), - handle.get_stream()); - csr_d.weights = rmm::device_buffer(mst_input.csr_h.weights.data(), - mst_input.csr_h.weights.size() * sizeof(weight_t), - handle.get_stream()); + csr_d.offsets = rmm::device_buffer( + mst_input.csr_h.offsets.data(), + mst_input.csr_h.offsets.size() * sizeof(edge_t), handle.get_stream()); + csr_d.indices = rmm::device_buffer( + mst_input.csr_h.indices.data(), + mst_input.csr_h.indices.size() * sizeof(vertex_t), handle.get_stream()); + csr_d.weights = rmm::device_buffer( + mst_input.csr_h.weights.data(), + mst_input.csr_h.weights.size() * sizeof(weight_t), handle.get_stream()); } void TearDown() override {} @@ -260,68 +259,41 @@ const std::vector> csr_in_h = { const std::vector> csr_in4_h = { {{0, 3, 5, 8, 10, 12, 14, 16}, {2, 4, 5, 3, 6, 0, 4, 5, 1, 6, 0, 2, 0, 2, 1, 3}, - {5.0f, - 9.0f, - 1.0f, - 8.0f, - 7.0f, - 5.0f, - 2.0f, - 6.0f, - 8.0f, - 10.0f, - 9.0f, - 2.0f, - 1.0f, - 6.0f, - 7.0f, - 10.0f}}}; + {5.0f, 9.0f, 1.0f, 8.0f, 7.0f, 5.0f, 2.0f, 6.0f, 8.0f, 10.0f, 9.0f, 2.0f, + 1.0f, 6.0f, 7.0f, 10.0f}}}; // singletons const std::vector> csr_in5_h = { {{0, 3, 5, 8, 10, 10, 10, 12, 14, 16, 16}, {2, 8, 7, 3, 8, 0, 8, 7, 1, 8, 0, 2, 0, 2, 1, 3}, - {5.0f, - 9.0f, - 1.0f, - 8.0f, - 7.0f, - 5.0f, - 2.0f, - 6.0f, - 8.0f, - 10.0f, - 9.0f, - 2.0f, - 1.0f, - 6.0f, - 7.0f, - 10.0f}}}; + {5.0f, 9.0f, 1.0f, 8.0f, 7.0f, 5.0f, 2.0f, 6.0f, 8.0f, 10.0f, 9.0f, 2.0f, + 1.0f, 6.0f, 7.0f, 10.0f}}}; typedef MSTTest MSTTestSequential; -TEST_P(MSTTestSequential, Sequential) -{ - auto results_pair = mst_gpu(); - auto& symmetric_result = results_pair.first; - auto& non_symmetric_result = results_pair.second; +TEST_P(MSTTestSequential, Sequential) { + auto results_pair = mst_gpu(); + auto &symmetric_result = results_pair.first; + auto &non_symmetric_result = results_pair.second; // do assertions here // in this case, running sequential MST auto prims_result = prims(mst_input.csr_h); - auto symmetric_sum = thrust::reduce(thrust::device, - symmetric_result.weights.data(), - symmetric_result.weights.data() + symmetric_result.n_edges); - auto non_symmetric_sum = - thrust::reduce(thrust::device, - non_symmetric_result.weights.data(), - non_symmetric_result.weights.data() + non_symmetric_result.n_edges); - - ASSERT_TRUE(raft::match(2 * prims_result, symmetric_sum, raft::CompareApprox(0.1))); - ASSERT_TRUE(raft::match(prims_result, non_symmetric_sum, raft::CompareApprox(0.1))); + auto symmetric_sum = + thrust::reduce(thrust::device, symmetric_result.weights.data(), + symmetric_result.weights.data() + symmetric_result.n_edges); + auto non_symmetric_sum = thrust::reduce( + thrust::device, non_symmetric_result.weights.data(), + non_symmetric_result.weights.data() + non_symmetric_result.n_edges); + + ASSERT_TRUE(raft::match(2 * prims_result, symmetric_sum, + raft::CompareApprox(0.1))); + ASSERT_TRUE(raft::match(prims_result, non_symmetric_sum, + raft::CompareApprox(0.1))); } -INSTANTIATE_TEST_SUITE_P(MSTTests, MSTTestSequential, ::testing::ValuesIn(csr_in_h)); +INSTANTIATE_TEST_SUITE_P(MSTTests, MSTTestSequential, + ::testing::ValuesIn(csr_in_h)); } // namespace mst } // namespace raft diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu index 25c8fe5084..af10dcab30 100644 --- a/cpp/test/random/rng.cu +++ b/cpp/test/random/rng.cu @@ -38,13 +38,12 @@ enum RandomType { }; template -__global__ void meanKernel(T* out, const T* data, int len) -{ +__global__ void meanKernel(T* out, const T* data, int len) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; int tid = threadIdx.x + blockIdx.x * blockDim.x; - T val = tid < len ? data[tid] : T(0); - T x = BlockReduce(temp_storage).Sum(val); + T val = tid < len ? data[tid] : T(0); + T x = BlockReduce(temp_storage).Sum(val); __syncthreads(); T xx = BlockReduce(temp_storage).Sum(val * val); __syncthreads(); @@ -71,8 +70,7 @@ struct RngInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const RngInputs& dims) -{ +::std::ostream& operator<<(::std::ostream& os, const RngInputs& dims) { return os; } @@ -82,30 +80,46 @@ template template class RngTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { // Tests are configured with their expected test-values sigma. For example, // 4 x sigma indicates the test shouldn't fail 99.9% of the time. num_sigma = 10; - params = ::testing::TestWithParam>::GetParam(); + params = ::testing::TestWithParam>::GetParam(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(params.seed, params.gtype); allocate(data, params.len); allocate(stats, 2, true); switch (params.type) { - case RNG_Normal: r.normal(data, params.len, params.start, params.end, stream); break; - case RNG_LogNormal: r.lognormal(data, params.len, params.start, params.end, stream); break; - case RNG_Uniform: r.uniform(data, params.len, params.start, params.end, stream); break; - case RNG_Gumbel: r.gumbel(data, params.len, params.start, params.end, stream); break; - case RNG_Logistic: r.logistic(data, params.len, params.start, params.end, stream); break; - case RNG_Exp: r.exponential(data, params.len, params.start, stream); break; - case RNG_Rayleigh: r.rayleigh(data, params.len, params.start, stream); break; - case RNG_Laplace: r.laplace(data, params.len, params.start, params.end, stream); break; + case RNG_Normal: + r.normal(data, params.len, params.start, params.end, stream); + break; + case RNG_LogNormal: + r.lognormal(data, params.len, params.start, params.end, stream); + break; + case RNG_Uniform: + r.uniform(data, params.len, params.start, params.end, stream); + break; + case RNG_Gumbel: + r.gumbel(data, params.len, params.start, params.end, stream); + break; + case RNG_Logistic: + r.logistic(data, params.len, params.start, params.end, stream); + break; + case RNG_Exp: + r.exponential(data, params.len, params.start, stream); + break; + case RNG_Rayleigh: + r.rayleigh(data, params.len, params.start, stream); + break; + case RNG_Laplace: + r.laplace(data, params.len, params.start, params.end, stream); + break; }; static const int threads = 128; meanKernel - <<>>(stats, data, params.len); + <<>>(stats, data, + params.len); update_host(h_stats, stats, 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; @@ -113,24 +127,23 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(stats)); } - void getExpectedMeanVar(T meanvar[2]) - { + void getExpectedMeanVar(T meanvar[2]) { switch (params.type) { case RNG_Normal: meanvar[0] = params.start; meanvar[1] = params.end * params.end; break; case RNG_LogNormal: { - auto var = params.end * params.end; - auto mu = params.start; + auto var = params.end * params.end; + auto mu = params.start; meanvar[0] = raft::myExp(mu + var * T(0.5)); - meanvar[1] = (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var); + meanvar[1] = + (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var); break; } case RNG_Uniform: @@ -154,7 +167,8 @@ class RngTest : public ::testing::TestWithParam> { break; case RNG_Rayleigh: meanvar[0] = params.start * raft::mySqrt(T(3.1415 / 2.0)); - meanvar[1] = ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start; + meanvar[1] = + ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start; break; case RNG_Laplace: meanvar[0] = params.start; @@ -245,12 +259,13 @@ const std::vector> inputsf = { {0.02, 32 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL}, {0.04, 8 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL}}; -TEST_P(RngTestF, Result) -{ +TEST_P(RngTestF, Result) { float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], + CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], + CompareApprox(num_sigma * params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestF, ::testing::ValuesIn(inputsf)); @@ -306,12 +321,13 @@ const std::vector> inputsd = { {0.025, 8 * 1024, 1.0, 1.0, RNG_Rayleigh, GenKiss99, 1234ULL}, {0.02, 32 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL}, {0.04, 8 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL}}; -TEST_P(RngTestD, Result) -{ +TEST_P(RngTestD, Result) { double meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], + CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], + CompareApprox(num_sigma * params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd)); @@ -319,8 +335,7 @@ INSTANTIATE_TEST_SUITE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd)); // Test for expected variance in mean calculations template -T quick_mean(const std::vector& d) -{ +T quick_mean(const std::vector& d) { T acc = T(0); for (const auto& di : d) { acc += di; @@ -329,9 +344,8 @@ T quick_mean(const std::vector& d) } template -T quick_std(const std::vector& d) -{ - T acc = T(0); +T quick_std(const std::vector& d) { + T acc = T(0); T d_mean = quick_mean(d); for (const auto& di : d) { acc += ((di - d_mean) * (di - d_mean)); @@ -340,8 +354,7 @@ T quick_std(const std::vector& d) } template -std::ostream& operator<<(std::ostream& out, const std::vector& v) -{ +std::ostream& operator<<(std::ostream& out, const std::vector& v) { if (!v.empty()) { out << '['; std::copy(v.begin(), v.end(), std::ostream_iterator(out, ", ")); @@ -356,12 +369,11 @@ std::ostream& operator<<(std::ostream& out, const std::vector& v) // experiments computing the mean, giving us a distribution of the mean // itself. The mean error is simply the standard deviation of this // distribution (the standard deviation of the mean). -TEST(Rng, MeanError) -{ +TEST(Rng, MeanError) { timeb time_struct; ftime(&time_struct); - int seed = time_struct.millitm; - int num_samples = 1024; + int seed = time_struct.millitm; + int num_samples = 1024; int num_experiments = 1024; float* data; float* mean_result; @@ -379,9 +391,10 @@ TEST(Rng, MeanError) Rng r(seed, rtype); r.normal(data, len, 3.3f, 0.23f, stream); // r.uniform(data, len, -1.0, 2.0); - raft::stats::mean(mean_result, data, num_samples, num_experiments, false, false, stream); - raft::stats::stddev( - std_result, data, mean_result, num_samples, num_experiments, false, false, stream); + raft::stats::mean(mean_result, data, num_samples, num_experiments, false, + false, stream); + raft::stats::stddev(std_result, data, mean_result, num_samples, + num_experiments, false, false, stream); std::vector h_mean_result(num_experiments); std::vector h_std_result(num_experiments); update_host(h_mean_result.data(), mean_result, num_experiments, stream); @@ -390,8 +403,8 @@ TEST(Rng, MeanError) auto d_mean = quick_mean(h_mean_result); // std-dev of mean; also known as mean error - auto d_std_of_mean = quick_std(h_mean_result); - auto d_std = quick_mean(h_std_result); + auto d_std_of_mean = quick_std(h_mean_result); + auto d_std = quick_mean(h_std_result); auto d_std_of_mean_analytical = d_std / std::sqrt(num_samples); // std::cout << "measured mean error: " << d_std_of_mean << "\n"; @@ -400,7 +413,8 @@ TEST(Rng, MeanError) auto diff_expected_vs_measured_mean_error = std::abs(d_std_of_mean - d_std / std::sqrt(num_samples)); - ASSERT_TRUE((diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5)); + ASSERT_TRUE( + (diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5)); } CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(data)); @@ -413,8 +427,7 @@ TEST(Rng, MeanError) template class ScaledBernoulliTest : public ::testing::Test { protected: - void SetUp() override - { + void SetUp() override { CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(42); @@ -425,12 +438,12 @@ class ScaledBernoulliTest : public ::testing::Test { void TearDown() override { CUDA_CHECK(cudaFree(data)); } - void rangeCheck() - { + void rangeCheck() { T* h_data = new T[len]; update_host(h_data, data, len, stream); - ASSERT_TRUE( - std::none_of(h_data, h_data + len, [](const T& a) { return a < -scale || a > scale; })); + ASSERT_TRUE(std::none_of(h_data, h_data + len, [](const T& a) { + return a < -scale || a > scale; + })); delete[] h_data; } @@ -447,8 +460,7 @@ TEST_F(ScaledBernoulliTest2, RangeCheck) { rangeCheck(); } template class BernoulliTest : public ::testing::Test { protected: - void SetUp() override - { + void SetUp() override { CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(42); allocate(data, len * sizeof(bool), stream); @@ -457,8 +469,7 @@ class BernoulliTest : public ::testing::Test { void TearDown() override { CUDA_CHECK(cudaFree(data)); } - void trueFalseCheck() - { + void trueFalseCheck() { // both true and false values must be present bool* h_data = new bool[len]; update_host(h_data, data, len, stream); @@ -488,21 +499,21 @@ struct RngNormalTableInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const RngNormalTableInputs& dims) -{ +::std::ostream& operator<<(::std::ostream& os, + const RngNormalTableInputs& dims) { return os; } template -class RngNormalTableTest : public ::testing::TestWithParam> { +class RngNormalTableTest + : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { // Tests are configured with their expected test-values sigma. For example, // 4 x sigma indicates the test shouldn't fail 99.9% of the time. num_sigma = 10; - params = ::testing::TestWithParam>::GetParam(); - int len = params.rows * params.cols; + params = ::testing::TestWithParam>::GetParam(); + int len = params.rows * params.cols; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -512,9 +523,11 @@ class RngNormalTableTest : public ::testing::TestWithParam<<>>(stats, data, len); + meanKernel + <<>>(stats, data, len); update_host(h_stats, stats, 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= len; @@ -522,15 +535,13 @@ class RngNormalTableTest : public ::testing::TestWithParam> inputsf_t = { {0.0055, 32, 1024, 1.f, 1.f, GenKiss99, 1234ULL}, {0.011, 8, 1024, 1.f, 1.f, GenKiss99, 1234ULL}}; -TEST_P(RngNormalTableTestF, Result) -{ +TEST_P(RngNormalTableTestF, Result) { float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], + CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], + CompareApprox(num_sigma * params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestF, ::testing::ValuesIn(inputsf_t)); +INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestF, + ::testing::ValuesIn(inputsf_t)); typedef RngNormalTableTest RngNormalTableTestD; const std::vector> inputsd_t = { @@ -568,14 +581,16 @@ const std::vector> inputsd_t = { {0.011, 8, 1024, 1.0, 1.0, GenTaps, 1234ULL}, {0.0055, 32, 1024, 1.0, 1.0, GenKiss99, 1234ULL}, {0.011, 8, 1024, 1.0, 1.0, GenKiss99, 1234ULL}}; -TEST_P(RngNormalTableTestD, Result) -{ +TEST_P(RngNormalTableTestD, Result) { double meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], + CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], + CompareApprox(num_sigma * params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestD, ::testing::ValuesIn(inputsd_t)); +INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestD, + ::testing::ValuesIn(inputsd_t)); struct RngAffineInputs { int n; @@ -584,15 +599,13 @@ struct RngAffineInputs { class RngAffineTest : public ::testing::TestWithParam { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam::GetParam(); Rng r(params.seed); r.affine_transform_params(params.n, a, b); } - void check() - { + void check() { ASSERT_TRUE(gcd(a, params.n) == 1); ASSERT_TRUE(0 <= b && b < params.n); } @@ -603,17 +616,13 @@ class RngAffineTest : public ::testing::TestWithParam { }; // RngAffineTest const std::vector inputs_affine = { - {100, 123456ULL}, - {100, 1234567890ULL}, - {101, 123456ULL}, - {101, 1234567890ULL}, - {7, 123456ULL}, - {7, 1234567890ULL}, - {2568, 123456ULL}, - {2568, 1234567890ULL}, + {100, 123456ULL}, {100, 1234567890ULL}, {101, 123456ULL}, + {101, 1234567890ULL}, {7, 123456ULL}, {7, 1234567890ULL}, + {2568, 123456ULL}, {2568, 1234567890ULL}, }; TEST_P(RngAffineTest, Result) { check(); } -INSTANTIATE_TEST_SUITE_P(RngAffineTests, RngAffineTest, ::testing::ValuesIn(inputs_affine)); +INSTANTIATE_TEST_SUITE_P(RngAffineTests, RngAffineTest, + ::testing::ValuesIn(inputs_affine)); } // namespace random } // namespace raft diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu index c77c3df526..92f12206e8 100644 --- a/cpp/test/random/rng_int.cu +++ b/cpp/test/random/rng_int.cu @@ -27,13 +27,12 @@ namespace random { enum RandomType { RNG_Uniform }; template -__global__ void meanKernel(float* out, const T* data, int len) -{ +__global__ void meanKernel(float *out, const T *data, int len) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; - int tid = threadIdx.x + blockIdx.x * blockDim.x; + int tid = threadIdx.x + blockIdx.x * blockDim.x; float val = tid < len ? data[tid] : T(0); - float x = BlockReduce(temp_storage).Sum(val); + float x = BlockReduce(temp_storage).Sum(val); __syncthreads(); float xx = BlockReduce(temp_storage).Sum(val * val); __syncthreads(); @@ -60,16 +59,14 @@ struct RngInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const RngInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, const RngInputs &dims) { return os; } template class RngTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); Rng r(params.seed, params.gtype); @@ -78,11 +75,14 @@ class RngTest : public ::testing::TestWithParam> { allocate(data, params.len); allocate(stats, 2, true); switch (params.type) { - case RNG_Uniform: r.uniformInt(data, params.len, params.start, params.end, stream); break; + case RNG_Uniform: + r.uniformInt(data, params.len, params.start, params.end, stream); + break; }; static const int threads = 128; meanKernel - <<>>(stats, data, params.len); + <<>>(stats, data, + params.len); update_host(h_stats, stats, 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; @@ -90,14 +90,12 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(stats)); } - void getExpectedMeanVar(float meanvar[2]) - { + void getExpectedMeanVar(float meanvar[2]) { switch (params.type) { case RNG_Uniform: meanvar[0] = (params.start + params.end) * 0.5f; @@ -109,8 +107,8 @@ class RngTest : public ::testing::TestWithParam> { protected: RngInputs params; - T* data; - float* stats; + T *data; + float *stats; float h_stats[2]; // mean, var }; @@ -122,12 +120,13 @@ const std::vector> inputs_u32 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestU32, Result) -{ +TEST_P(RngTestU32, Result) { float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE( + match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE( + match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU32, ::testing::ValuesIn(inputs_u32)); @@ -139,12 +138,13 @@ const std::vector> inputs_u64 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestU64, Result) -{ +TEST_P(RngTestU64, Result) { float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE( + match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE( + match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU64, ::testing::ValuesIn(inputs_u64)); @@ -156,12 +156,13 @@ const std::vector> inputs_s32 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestS32, Result) -{ +TEST_P(RngTestS32, Result) { float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE( + match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE( + match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS32, ::testing::ValuesIn(inputs_s32)); @@ -173,12 +174,13 @@ const std::vector> inputs_s64 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestS64, Result) -{ +TEST_P(RngTestS64, Result) { float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE( + match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE( + match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS64, ::testing::ValuesIn(inputs_s64)); diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu index c258841c3e..d7e52a8958 100644 --- a/cpp/test/random/sample_without_replacement.cu +++ b/cpp/test/random/sample_without_replacement.cu @@ -38,16 +38,14 @@ struct SWoRInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const SWoRInputs& dims) -{ +::std::ostream& operator<<(::std::ostream& os, const SWoRInputs& dims) { return os; } template class SWoRTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); CUDA_CHECK(cudaStreamCreate(&stream)); @@ -60,14 +58,15 @@ class SWoRTest : public ::testing::TestWithParam> { r.uniform(in, params.len, T(-1.0), T(1.0), stream); r.uniform(wts, params.len, T(1.0), T(2.0), stream); if (params.largeWeightIndex >= 0) { - update_device(wts + params.largeWeightIndex, ¶ms.largeWeight, 1, stream); + update_device(wts + params.largeWeightIndex, ¶ms.largeWeight, 1, + stream); } - r.sampleWithoutReplacement(handle, out, outIdx, in, wts, params.sampledLen, params.len, stream); + r.sampleWithoutReplacement(handle, out, outIdx, in, wts, params.sampledLen, + params.len, stream); update_host(&(h_outIdx[0]), outIdx, params.sampledLen, stream); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaStreamDestroy(stream)); CUDA_CHECK(cudaFree(in)); @@ -148,14 +147,14 @@ const std::vector> inputsf = { {1024, 512, 10, 100000.f, GenKiss99, 1234ULL}, }; -TEST_P(SWoRTestF, Result) -{ +TEST_P(SWoRTestF, Result) { std::set occurence; for (int i = 0; i < params.sampledLen; ++i) { auto val = h_outIdx[i]; // indices must be in the given range ASSERT_TRUE(0 <= val && val < params.len) - << "out-of-range index @i=" << i << " val=" << val << " sampledLen=" << params.sampledLen; + << "out-of-range index @i=" << i << " val=" << val + << " sampledLen=" << params.sampledLen; // indices should not repeat ASSERT_TRUE(occurence.find(val) == occurence.end()) << "repeated index @i=" << i << " idx=" << val; @@ -163,7 +162,9 @@ TEST_P(SWoRTestF, Result) } // if there's a skewed distribution, the top index should correspond to the // particular item with a large weight - if (params.largeWeightIndex >= 0) { ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); } + if (params.largeWeightIndex >= 0) { + ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); + } } INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestF, ::testing::ValuesIn(inputsf)); @@ -230,14 +231,14 @@ const std::vector> inputsd = { {1024, 512, 10, 100000.0, GenKiss99, 1234ULL}, }; -TEST_P(SWoRTestD, Result) -{ +TEST_P(SWoRTestD, Result) { std::set occurence; for (int i = 0; i < params.sampledLen; ++i) { auto val = h_outIdx[i]; // indices must be in the given range ASSERT_TRUE(0 <= val && val < params.len) - << "out-of-range index @i=" << i << " val=" << val << " sampledLen=" << params.sampledLen; + << "out-of-range index @i=" << i << " val=" << val + << " sampledLen=" << params.sampledLen; // indices should not repeat ASSERT_TRUE(occurence.find(val) == occurence.end()) << "repeated index @i=" << i << " idx=" << val; @@ -245,7 +246,9 @@ TEST_P(SWoRTestD, Result) } // if there's a skewed distribution, the top index should correspond to the // particular item with a large weight - if (params.largeWeightIndex >= 0) { ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); } + if (params.largeWeightIndex >= 0) { + ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); + } } INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestD, ::testing::ValuesIn(inputsd)); diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu index e1f814a5b6..713708d4cd 100644 --- a/cpp/test/sparse/add.cu +++ b/cpp/test/sparse/add.cu @@ -44,14 +44,14 @@ struct CSRAddInputs { }; template -class CSRAddTest : public ::testing::TestWithParam> { +class CSRAddTest + : public ::testing::TestWithParam> { protected: - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); - n_rows = params.matrix_a.row_ind.size(); - nnz_a = params.matrix_a.row_ind_ptr.size(); - nnz_b = params.matrix_b.row_ind_ptr.size(); + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + n_rows = params.matrix_a.row_ind.size(); + nnz_a = params.matrix_a.row_ind_ptr.size(); + nnz_b = params.matrix_b.row_ind_ptr.size(); nnz_result = params.matrix_verify.row_ind_ptr.size(); cudaStreamCreate(&stream); @@ -73,61 +73,46 @@ class CSRAddTest : public ::testing::TestWithParam> raft::allocate(values_result, nnz_result); } - void Run() - { - std::shared_ptr alloc(new raft::mr::device::default_allocator); + void Run() { + std::shared_ptr alloc( + new raft::mr::device::default_allocator); raft::update_device(ind_a, params.matrix_a.row_ind.data(), n_rows, stream); - raft::update_device(ind_ptr_a, params.matrix_a.row_ind_ptr.data(), nnz_a, stream); + raft::update_device(ind_ptr_a, params.matrix_a.row_ind_ptr.data(), nnz_a, + stream); raft::update_device(values_a, params.matrix_a.values.data(), nnz_a, stream); raft::update_device(ind_b, params.matrix_b.row_ind.data(), n_rows, stream); - raft::update_device(ind_ptr_b, params.matrix_b.row_ind_ptr.data(), nnz_b, stream); + raft::update_device(ind_ptr_b, params.matrix_b.row_ind_ptr.data(), nnz_b, + stream); raft::update_device(values_b, params.matrix_b.values.data(), nnz_b, stream); - raft::update_device(ind_verify, params.matrix_verify.row_ind.data(), n_rows, stream); - raft::update_device( - ind_ptr_verify, params.matrix_verify.row_ind_ptr.data(), nnz_result, stream); - raft::update_device(values_verify, params.matrix_verify.values.data(), nnz_result, stream); - - Index_ nnz = linalg::csr_add_calc_inds(ind_a, - ind_ptr_a, - values_a, - nnz_a, - ind_b, - ind_ptr_b, - values_b, - nnz_b, - n_rows, - ind_result, - alloc, - stream); + raft::update_device(ind_verify, params.matrix_verify.row_ind.data(), n_rows, + stream); + raft::update_device(ind_ptr_verify, params.matrix_verify.row_ind_ptr.data(), + nnz_result, stream); + raft::update_device(values_verify, params.matrix_verify.values.data(), + nnz_result, stream); + + Index_ nnz = linalg::csr_add_calc_inds( + ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b, + n_rows, ind_result, alloc, stream); ASSERT_TRUE(nnz == nnz_result); - ASSERT_TRUE(raft::devArrMatch(ind_verify, ind_result, n_rows, raft::Compare())); - - linalg::csr_add_finalize(ind_a, - ind_ptr_a, - values_a, - nnz_a, - ind_b, - ind_ptr_b, - values_b, - nnz_b, - n_rows, - ind_result, - ind_ptr_result, - values_result, - stream); - - ASSERT_TRUE( - raft::devArrMatch(ind_ptr_verify, ind_ptr_result, nnz, raft::Compare())); - ASSERT_TRUE( - raft::devArrMatch(values_verify, values_result, nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(ind_verify, ind_result, n_rows, + raft::Compare())); + + linalg::csr_add_finalize( + ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b, + n_rows, ind_result, ind_ptr_result, values_result, stream); + + ASSERT_TRUE(raft::devArrMatch(ind_ptr_verify, ind_ptr_result, nnz, + raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(values_verify, values_result, nnz, + raft::Compare())); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(ind_a)); CUDA_CHECK(cudaFree(ind_b)); CUDA_CHECK(cudaFree(ind_result)); @@ -146,8 +131,8 @@ class CSRAddTest : public ::testing::TestWithParam> CSRAddInputs params; cudaStream_t stream; Index_ n_rows, nnz_a, nnz_b, nnz_result; - Index_ *ind_a, *ind_b, *ind_verify, *ind_result, *ind_ptr_a, *ind_ptr_b, *ind_ptr_verify, - *ind_ptr_result; + Index_ *ind_a, *ind_b, *ind_verify, *ind_result, *ind_ptr_a, *ind_ptr_b, + *ind_ptr_verify, *ind_ptr_result; Type_f *values_a, *values_b, *values_verify, *values_result; }; @@ -180,8 +165,10 @@ const std::vector> csradd_inputs_d = { {2.0, 2.0, 0.5, 1.0, 0.5, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}}}, }; -INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestF, ::testing::ValuesIn(csradd_inputs_f)); -INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestD, ::testing::ValuesIn(csradd_inputs_d)); +INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestF, + ::testing::ValuesIn(csradd_inputs_f)); +INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestD, + ::testing::ValuesIn(csradd_inputs_d)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu index 3678d34bbe..d98f9de9c3 100644 --- a/cpp/test/sparse/connect_components.cu +++ b/cpp/test/sparse/connect_components.cu @@ -51,24 +51,26 @@ struct ConnectComponentsInputs { }; template -class ConnectComponentsTest - : public ::testing::TestWithParam> { +class ConnectComponentsTest : public ::testing::TestWithParam< + ConnectComponentsInputs> { protected: - void basicTest() - { + void basicTest() { raft::handle_t handle; auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); - params = ::testing::TestWithParam>::GetParam(); + params = ::testing::TestWithParam< + ConnectComponentsInputs>::GetParam(); - raft::sparse::COO out_edges(handle.get_device_allocator(), - handle.get_stream()); + raft::sparse::COO out_edges( + handle.get_device_allocator(), handle.get_stream()); - rmm::device_uvector data(params.n_row * params.n_col, handle.get_stream()); + rmm::device_uvector data(params.n_row * params.n_col, + handle.get_stream()); - raft::copy(data.data(), params.data.data(), data.size(), handle.get_stream()); + raft::copy(data.data(), params.data.data(), data.size(), + handle.get_stream()); rmm::device_uvector indptr(params.n_row + 1, stream); @@ -77,58 +79,44 @@ class ConnectComponentsTest */ raft::sparse::COO knn_graph_coo(d_alloc, stream); - raft::sparse::selection::knn_graph(handle, - data.data(), - params.n_row, - params.n_col, - raft::distance::DistanceType::L2SqrtExpanded, - knn_graph_coo, - params.c); + raft::sparse::selection::knn_graph( + handle, data.data(), params.n_row, params.n_col, + raft::distance::DistanceType::L2SqrtExpanded, knn_graph_coo, params.c); - raft::sparse::convert::sorted_coo_to_csr( - knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), params.n_row + 1, d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), + knn_graph_coo.nnz, indptr.data(), + params.n_row + 1, d_alloc, stream); /** * 2. Construct MST, sorted by weights */ rmm::device_uvector colors(params.n_row, stream); - auto mst_coo = raft::mst::mst(handle, - indptr.data(), - knn_graph_coo.cols(), - knn_graph_coo.vals(), - params.n_row, - knn_graph_coo.nnz, - colors.data(), - stream, - false, - true); + auto mst_coo = raft::mst::mst( + handle, indptr.data(), knn_graph_coo.cols(), knn_graph_coo.vals(), + params.n_row, knn_graph_coo.nnz, colors.data(), stream, false, true); /** * 3. connect_components to fix connectivities */ - raft::linkage::FixConnectivitiesRedOp red_op(colors.data(), params.n_row); + raft::linkage::FixConnectivitiesRedOp red_op( + colors.data(), params.n_row); raft::linkage::connect_components( - handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, red_op); + handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, + red_op); /** * Construct final edge list */ rmm::device_uvector indptr2(params.n_row + 1, stream); - raft::sparse::convert::sorted_coo_to_csr( - out_edges.rows(), out_edges.nnz, indptr2.data(), params.n_row + 1, d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr(out_edges.rows(), out_edges.nnz, + indptr2.data(), params.n_row + 1, + d_alloc, stream); - auto output_mst = raft::mst::mst(handle, - indptr2.data(), - out_edges.cols(), - out_edges.vals(), - params.n_row, - out_edges.nnz, - colors.data(), - stream, - false, - false); + auto output_mst = raft::mst::mst( + handle, indptr2.data(), out_edges.cols(), out_edges.vals(), params.n_row, + out_edges.nnz, colors.data(), stream, false, false); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -150,199 +138,366 @@ const std::vector> fix_conn_inputsf2 = { // Test n_clusters == n_points {10, 5, - {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379, - 0.4035871, 0.3282796, 0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717, - 0.50498218, 0.5113505, 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, - 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, 0.84854131, 0.28890216, - 0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554, - 0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, + {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, + 0.77782677, 0.43772379, 0.4035871, 0.3282796, 0.47544681, 0.59862974, + 0.12319357, 0.06239463, 0.28200272, 0.1345717, 0.50498218, 0.5113505, + 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, + 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, + 0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792, + 0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692, + 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, 0.76166195, 0.66613745}, -1}, // Test n_points == 100 {100, 10, - {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, - 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, - 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, - 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, - 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, - 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, - 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, - 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, - 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, - 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, - 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, - 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, - 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, - 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, - 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, - 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, - 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, - 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, - 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, - 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, - 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, - 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, - 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, - 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, - 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, - 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, - 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, - 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, - 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, - 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, - 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, - 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, - 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, - 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, - 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, - 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, - 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, - 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, - 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, - 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, - 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, - 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, - 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, - 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, - 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, - 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, - 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, - 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, - 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, - 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, - 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, - 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, - 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, - 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, - 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, - 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, - 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, - 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, - 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, - 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, - 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, - 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, - 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, - 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, - 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, - 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, - 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, - 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, - 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, - 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, - 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, - 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, - 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, - 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, - 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, - 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, - 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, - 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, - 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, - 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, - 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, - 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, - 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, - 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, - 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, - 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, - 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, - 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, - 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, - 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, - 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, - 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, - 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, - 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, - 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, - 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, - 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, - 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, - 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, - 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, - 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, - 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, - 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, - 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, - 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, - 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, - 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, - 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, - 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, - 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, - 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, - 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, - 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, - 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, - 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, - 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, - 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, - 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, - 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, - 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, - 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, - 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, - 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, - 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, - 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, - 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, - 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, - 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, - 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, - 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, - 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, - 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, - 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, - 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, - 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, - 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, - 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, - 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, - 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, - 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, - 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, - 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, - 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, - 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, - 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, - 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, - 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, - 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, - 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, - 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, - 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, - 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, - 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, - 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, - 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, - 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, - 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, - 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, - 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, - 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, - 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, - 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, - 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, - 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, - 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, - 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, - 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01 + {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, + 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, + 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, + 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, + 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, + 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, + 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, + 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, + 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, + 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, + 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, + 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, + 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, + 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, + 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, + 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, + 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, + 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, + 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, + 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, + 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, + 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, + 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, + 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, + 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, + 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, + 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, + 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, + 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, + 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, + 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, + 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, + 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, + 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, + 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, + 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, + 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, + 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, + 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, + 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, + 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, + 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, + 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, + 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, + 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, + 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, + 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, + 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, + 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, + 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, + 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, + 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, + 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, + 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, + 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, + 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, + 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, + 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, + 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, + 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, + 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, + 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, + 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, + 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, + 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, + 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, + 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, + 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, + 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, + 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, + 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, + 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, + 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, + 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, + 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, + 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, + 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, + 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, + 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, + 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, + 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, + 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, + 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, + 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, + 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, + 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, + 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, + 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, + 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, + 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, + 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, + 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, + 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, + 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, + 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, + 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, + 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, + 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, + 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, + 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, + 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, + 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, + 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, + 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, + 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, + 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, + 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, + 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, + 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, + 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, + 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, + 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, + 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, + 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, + 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, + 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, + 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, + 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, + 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, + 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, + 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, + 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, + 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, + 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, + 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, + 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, + 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, + 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, + 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, + 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, + 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, + 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, + 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, + 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, + 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, + 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, + 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, + 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, + 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, + 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, + 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, + 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, + 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, + 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, + 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, + 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, + 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, + 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, + 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, + 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, + 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, + 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, + 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, + 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, + 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, + 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, + 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, + 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, + 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, + 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, + 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, + 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, + 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, + 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, + 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, + 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, + 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, + 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, + 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, + 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, + 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, + 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, + 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, + 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, + 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, + 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, + 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, + 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, + 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, + 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, + 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, + 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, + 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, + 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, + 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, + 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, + 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, + 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, + 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, + 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, + 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, + 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, + 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, + 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, + 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, + 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, + 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, + 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, + 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, + 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, + 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, + 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, + 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, + 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, + 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, + 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, + 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, + 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, + 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, + 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, + 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, + 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, + 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, + 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, + 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, + 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, + 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, + 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, + 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, + 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, + 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, + 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, + 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, + 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, + 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, + 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, + 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, + 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, + 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, + 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, + 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, + 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, + 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, + 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, + 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, + 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, + 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, + 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, + 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, + 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, + 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, + 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, + 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, + 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, + 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, + 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, + 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, + 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, + 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, + 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, + 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, + 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, + 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, + 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, + 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, + 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, + 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, + 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, + 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, + 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, + 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, + 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, + 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, + 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, + 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, + 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, + 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, + 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, + 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, + 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, + 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, + 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, + 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, + 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, + 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, + 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, + 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, + 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, + 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, + 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, + 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, + 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, + 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, + 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, + 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, + 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, + 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, + 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, + 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, + 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, + 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, + 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, + 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, + 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, + 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, + 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, + 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, + 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, + 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, + 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, + 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, + 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, + 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, + 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, + 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, + 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, + 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, + 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, + 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, + 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, + 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, + 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, + 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, + 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, + 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, + 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, + 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, + 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, + 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, + 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, + 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, + 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, + 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, + 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, + 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, + 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, + 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, + 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, + 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, + 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, + 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, + 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, + 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, + 8.66342445e-01 }, -4}}; typedef ConnectComponentsTest ConnectComponentsTestF_Int; -TEST_P(ConnectComponentsTestF_Int, Result) -{ +TEST_P(ConnectComponentsTestF_Int, Result) { /** - * Verify the src & dst vertices on each edge have different colors - */ + * Verify the src & dst vertices on each edge have different colors + */ EXPECT_TRUE(final_edges == params.n_row - 1); } -INSTANTIATE_TEST_CASE_P(ConnectComponentsTest, - ConnectComponentsTestF_Int, +INSTANTIATE_TEST_CASE_P(ConnectComponentsTest, ConnectComponentsTestF_Int, ::testing::ValuesIn(fix_conn_inputsf2)); }; // namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu index 2e4c2c1a14..ea69ecfc53 100644 --- a/cpp/test/sparse/convert_coo.cu +++ b/cpp/test/sparse/convert_coo.cu @@ -39,8 +39,7 @@ struct CSRtoCOOInputs { template class CSRtoCOOTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); cudaStreamCreate(&stream); @@ -49,21 +48,20 @@ class CSRtoCOOTest : public ::testing::TestWithParam> { raft::allocate(result, params.verify.size(), true); } - void Run() - { + void Run() { Index_ n_rows = params.ex_scan.size(); - Index_ nnz = params.verify.size(); + Index_ nnz = params.verify.size(); raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream); raft::update_device(verify, params.verify.data(), nnz, stream); convert::csr_to_coo(ex_scan, n_rows, result, nnz, stream); - ASSERT_TRUE(raft::devArrMatch(verify, result, nnz, raft::Compare(), stream)); + ASSERT_TRUE(raft::devArrMatch(verify, result, nnz, + raft::Compare(), stream)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(ex_scan)); CUDA_CHECK(cudaFree(verify)); CUDA_CHECK(cudaFree(result)); @@ -91,11 +89,9 @@ const std::vector> csrtocoo_inputs_64 = { {{0, 4, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 2, 3}}, }; -INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, - CSRtoCOOTestI, +INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestI, ::testing::ValuesIn(csrtocoo_inputs_32)); -INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, - CSRtoCOOTestL, +INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestL, ::testing::ValuesIn(csrtocoo_inputs_64)); } // namespace sparse diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu index b2878081ae..553ef2ddee 100644 --- a/cpp/test/sparse/convert_csr.cu +++ b/cpp/test/sparse/convert_csr.cu @@ -37,13 +37,14 @@ struct SparseConvertCSRInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const SparseConvertCSRInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, + const SparseConvertCSRInputs &dims) { return os; } template -class SparseConvertCSRTest : public ::testing::TestWithParam> { +class SparseConvertCSRTest + : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -53,21 +54,22 @@ class SparseConvertCSRTest : public ::testing::TestWithParam params; }; -const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; +const std::vector> inputsf = { + {5, 10, 5, 1234ULL}}; typedef SparseConvertCSRTest SortedCOOToCSR; -TEST_P(SortedCOOToCSR, Result) -{ +TEST_P(SortedCOOToCSR, Result) { cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc(new raft::mr::device::default_allocator); + std::shared_ptr alloc( + new raft::mr::device::default_allocator); int nnz = 8; int *in, *out, *exp; - int* in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; - int* exp_h = new int[4]{0, 2, 4, 6}; + int *in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; + int *exp_h = new int[4]{0, 2, 4, 6}; raft::allocate(in, nnz, true); raft::allocate(exp, 4, true); @@ -90,7 +92,8 @@ TEST_P(SortedCOOToCSR, Result) CUDA_CHECK(cudaFree(out)); } -INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR, + ::testing::ValuesIn(inputsf)); /******************************** adj graph ********************************/ @@ -104,10 +107,10 @@ struct CSRAdjGraphInputs { }; template -class CSRAdjGraphTest : public ::testing::TestWithParam> { +class CSRAdjGraphTest + : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); cudaStreamCreate(&stream); nnz = params.verify.size(); @@ -118,21 +121,20 @@ class CSRAdjGraphTest : public ::testing::TestWithParam(params.adj.data()), params.n_rows * params.n_cols, stream); + raft::update_device(adj, reinterpret_cast(params.adj.data()), + params.n_rows * params.n_cols, stream); raft::update_device(verify, params.verify.data(), nnz, stream); convert::csr_adj_graph_batched( row_ind, params.n_cols, nnz, params.n_rows, adj, result, stream); - ASSERT_TRUE(raft::devArrMatch(verify, result, nnz, raft::Compare())); + ASSERT_TRUE( + raft::devArrMatch(verify, result, nnz, raft::Compare())); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(row_ind)); CUDA_CHECK(cudaFree(adj)); CUDA_CHECK(cudaFree(verify)); @@ -145,7 +147,7 @@ class CSRAdjGraphTest : public ::testing::TestWithParam; @@ -169,11 +171,9 @@ const std::vector> csradjgraph_inputs_l = { {0, 1, 2, 0, 1, 2, 0, 1, 2}}, }; -INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, - CSRAdjGraphTestI, +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestI, ::testing::ValuesIn(csradjgraph_inputs_i)); -INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, - CSRAdjGraphTestL, +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestL, ::testing::ValuesIn(csradjgraph_inputs_l)); } // namespace sparse diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu index fe43f0d182..625772a842 100644 --- a/cpp/test/sparse/csr_row_slice.cu +++ b/cpp/test/sparse/csr_row_slice.cu @@ -47,19 +47,19 @@ struct CSRRowSliceInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const CSRRowSliceInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, + const CSRRowSliceInputs &dims) { return os; } template -class CSRRowSliceTest : public ::testing::TestWithParam> { +class CSRRowSliceTest + : public ::testing::TestWithParam> { protected: - void make_data() - { - std::vector indptr_h = params.indptr_h; + void make_data() { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); @@ -69,27 +69,31 @@ class CSRRowSliceTest : public ::testing::TestWithParam out_indptr_ref_h = params.out_indptr_ref_h; + std::vector out_indptr_ref_h = params.out_indptr_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; - std::vector out_data_ref_h = params.out_data_ref_h; + std::vector out_data_ref_h = params.out_data_ref_h; allocate(out_indptr_ref, out_indptr_ref_h.size()); allocate(out_indices_ref, out_indices_ref_h.size()); allocate(out_data_ref, out_data_ref_h.size()); - update_device(out_indptr_ref, out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream); - update_device(out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), stream); - update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), stream); + update_device(out_indptr_ref, out_indptr_ref_h.data(), + out_indptr_ref_h.size(), stream); + update_device(out_indices_ref, out_indices_ref_h.data(), + out_indices_ref_h.size(), stream); + update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), + stream); allocate(out_indptr, out_indptr_ref_h.size()); allocate(out_indices, out_indices_ref_h.size()); allocate(out_data, out_data_ref_h.size()); } - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); - std::shared_ptr alloc(new raft::mr::device::default_allocator); + void SetUp() override { + params = ::testing::TestWithParam< + CSRRowSliceInputs>::GetParam(); + std::shared_ptr alloc( + new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); make_data(); @@ -97,22 +101,18 @@ class CSRRowSliceTest : public ::testing::TestWithParam())); - ASSERT_TRUE(devArrMatch( - out_indices, out_indices_ref, params.out_indices_ref_h.size(), Compare())); - ASSERT_TRUE( - devArrMatch(out_data, out_data_ref, params.out_data_ref_h.size(), Compare())); + void compare() { + ASSERT_TRUE(devArrMatch(out_indptr, out_indptr_ref, + params.out_indptr_ref_h.size(), + Compare())); + ASSERT_TRUE(devArrMatch(out_indices, out_indices_ref, + params.out_indices_ref_h.size(), + Compare())); + ASSERT_TRUE(devArrMatch(out_data, out_data_ref, + params.out_data_ref_h.size(), Compare())); } protected: @@ -140,15 +141,15 @@ class CSRRowSliceTest : public ::testing::TestWithParam params; }; @@ -176,7 +177,8 @@ const std::vector> inputs_i32_f = { }; typedef CSRRowSliceTest CSRRowSliceTestF; TEST_P(CSRRowSliceTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(CSRRowSliceTest, CSRRowSliceTestF, ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(CSRRowSliceTest, CSRRowSliceTestF, + ::testing::ValuesIn(inputs_i32_f)); }; // end namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu index 286493ada7..5535df4fe3 100644 --- a/cpp/test/sparse/csr_to_dense.cu +++ b/cpp/test/sparse/csr_to_dense.cu @@ -43,19 +43,19 @@ struct CSRToDenseInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const CSRToDenseInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, + const CSRToDenseInputs &dims) { return os; } template -class CSRToDenseTest : public ::testing::TestWithParam> { +class CSRToDenseTest + : public ::testing::TestWithParam> { protected: - void make_data() - { - std::vector indptr_h = params.indptr_h; + void make_data() { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); @@ -74,24 +74,24 @@ class CSRToDenseTest : public ::testing::TestWithParam>::GetParam(); - std::shared_ptr alloc(new raft::mr::device::default_allocator); + void SetUp() override { + params = ::testing::TestWithParam< + CSRToDenseInputs>::GetParam(); + std::shared_ptr alloc( + new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&handle)); make_data(); - convert::csr_to_dense( - handle, params.nrows, params.ncols, indptr, indices, data, params.nrows, out, stream, true); + convert::csr_to_dense(handle, params.nrows, params.ncols, indptr, indices, + data, params.nrows, out, stream, true); CUDA_CHECK(cudaStreamSynchronize(stream)); CUSPARSE_CHECK(cusparseDestroy(handle)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); @@ -100,9 +100,9 @@ class CSRToDenseTest : public ::testing::TestWithParam())); + void compare() { + ASSERT_TRUE( + devArrMatch(out, out_ref, params.out_ref_h.size(), Compare())); } protected: @@ -111,13 +111,13 @@ class CSRToDenseTest : public ::testing::TestWithParam params; }; @@ -128,26 +128,13 @@ const std::vector> inputs_i32_f = { {0, 2, 4, 6, 8}, {0, 1, 2, 3, 0, 1, 2, 3}, // indices {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f}, - {1.0f, - 3.0f, - 0.0f, - 0.0f, - 0.0f, - 0.0f, - 1.0f, - 5.0f, - 50.0f, - 28.0f, - 0.0f, - 0.0f, - 0.0f, - 0.0f, - 16.0f, - 2.0f}}, + {1.0f, 3.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 5.0f, 50.0f, 28.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 16.0f, 2.0f}}, }; typedef CSRToDenseTest CSRToDenseTestF; TEST_P(CSRToDenseTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(CSRToDenseTest, CSRToDenseTestF, ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(CSRToDenseTest, CSRToDenseTestF, + ::testing::ValuesIn(inputs_i32_f)); }; // end namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu index 87b8b17073..c257d6eb3c 100644 --- a/cpp/test/sparse/csr_transpose.cu +++ b/cpp/test/sparse/csr_transpose.cu @@ -49,19 +49,19 @@ struct CSRTransposeInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const CSRTransposeInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, + const CSRTransposeInputs &dims) { return os; } template -class CSRTransposeTest : public ::testing::TestWithParam> { +class CSRTransposeTest + : public ::testing::TestWithParam> { protected: - void make_data() - { - std::vector indptr_h = params.indptr_h; + void make_data() { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); @@ -71,51 +71,45 @@ class CSRTransposeTest : public ::testing::TestWithParam out_indptr_ref_h = params.out_indptr_ref_h; + std::vector out_indptr_ref_h = params.out_indptr_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; - std::vector out_data_ref_h = params.out_data_ref_h; + std::vector out_data_ref_h = params.out_data_ref_h; allocate(out_indptr_ref, out_indptr_ref_h.size()); allocate(out_indices_ref, out_indices_ref_h.size()); allocate(out_data_ref, out_data_ref_h.size()); - update_device(out_indptr_ref, out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream); - update_device(out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), stream); - update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), stream); + update_device(out_indptr_ref, out_indptr_ref_h.data(), + out_indptr_ref_h.size(), stream); + update_device(out_indices_ref, out_indices_ref_h.data(), + out_indices_ref_h.size(), stream); + update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), + stream); allocate(out_indptr, out_indptr_ref_h.size()); allocate(out_indices, out_indices_ref_h.size()); allocate(out_data, out_data_ref_h.size()); } - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); - std::shared_ptr alloc(new raft::mr::device::default_allocator); + void SetUp() override { + params = ::testing::TestWithParam< + CSRTransposeInputs>::GetParam(); + std::shared_ptr alloc( + new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&handle)); make_data(); - raft::sparse::linalg::csr_transpose(handle, - indptr, - indices, - data, - out_indptr, - out_indices, - out_data, - params.nrows, - params.ncols, - params.nnz, - alloc, - stream); + raft::sparse::linalg::csr_transpose( + handle, indptr, indices, data, out_indptr, out_indices, out_data, + params.nrows, params.ncols, params.nnz, alloc, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); CUSPARSE_CHECK(cusparseDestroy(handle)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); @@ -128,14 +122,15 @@ class CSRTransposeTest : public ::testing::TestWithParam())); - ASSERT_TRUE(devArrMatch( - out_indices, out_indices_ref, params.out_indices_ref_h.size(), Compare())); - ASSERT_TRUE( - devArrMatch(out_data, out_data_ref, params.out_data_ref_h.size(), Compare())); + void compare() { + ASSERT_TRUE(devArrMatch(out_indptr, out_indptr_ref, + params.out_indptr_ref_h.size(), + Compare())); + ASSERT_TRUE(devArrMatch(out_indices, out_indices_ref, + params.out_indices_ref_h.size(), + Compare())); + ASSERT_TRUE(devArrMatch(out_data, out_data_ref, + params.out_data_ref_h.size(), Compare())); } protected: @@ -144,15 +139,15 @@ class CSRTransposeTest : public ::testing::TestWithParam params; }; @@ -172,7 +167,8 @@ const std::vector> inputs_i32_f = { }; typedef CSRTransposeTest CSRTransposeTestF; TEST_P(CSRTransposeTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(CSRTransposeTest, CSRTransposeTestF, ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(CSRTransposeTest, CSRTransposeTestF, + ::testing::ValuesIn(inputs_i32_f)); }; // end namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu index c6b2a27273..5d687ad92b 100644 --- a/cpp/test/sparse/degree.cu +++ b/cpp/test/sparse/degree.cu @@ -33,7 +33,8 @@ struct SparseDegreeInputs { }; template -class SparseDegreeTests : public ::testing::TestWithParam> { +class SparseDegreeTests + : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -46,12 +47,11 @@ class SparseDegreeTests : public ::testing::TestWithParam> const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseDegreeTests COODegree; -TEST_P(COODegree, Result) -{ +TEST_P(COODegree, Result) { int *in_rows, *verify, *results; int in_rows_h[5] = {0, 0, 1, 2, 2}; - int verify_h[5] = {2, 1, 2, 0, 0}; + int verify_h[5] = {2, 1, 2, 0, 0}; raft::allocate(in_rows, 5); raft::allocate(verify, 5, true); @@ -70,17 +70,16 @@ TEST_P(COODegree, Result) } typedef SparseDegreeTests COODegreeNonzero; -TEST_P(COODegreeNonzero, Result) -{ +TEST_P(COODegreeNonzero, Result) { cudaStream_t stream; cudaStreamCreate(&stream); int *in_rows, *verify, *results; - float* in_vals; + float *in_vals; - int in_rows_h[5] = {0, 0, 1, 2, 2}; + int in_rows_h[5] = {0, 0, 1, 2, 2}; float in_vals_h[5] = {0.0, 5.0, 0.0, 1.0, 1.0}; - int verify_h[5] = {1, 0, 2, 0, 0}; + int verify_h[5] = {1, 0, 2, 0, 0}; raft::allocate(in_rows, 5); raft::allocate(verify, 5, true); @@ -102,8 +101,10 @@ TEST_P(COODegreeNonzero, Result) CUDA_CHECK(cudaStreamDestroy(stream)); } -INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegreeNonzero, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree, + ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegreeNonzero, + ::testing::ValuesIn(inputsf)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu index 7c0db49a04..a83b93f83f 100644 --- a/cpp/test/sparse/dist_coo_spmv.cu +++ b/cpp/test/sparse/dist_coo_spmv.cu @@ -55,82 +55,71 @@ struct InputConfiguration { }; using dense_smem_strategy_t = dense_smem_strategy; -using hash_strategy_t = hash_strategy; +using hash_strategy_t = hash_strategy; template struct SparseDistanceCOOSPMVInputs { InputConfiguration input_configuration; float capacity_threshold = 0.5; - int map_size = hash_strategy::get_map_size(); + int map_size = hash_strategy::get_map_size(); }; template -::std::ostream& operator<<(::std::ostream& os, - const SparseDistanceCOOSPMVInputs& dims) -{ +::std::ostream &operator<<( + ::std::ostream &os, + const SparseDistanceCOOSPMVInputs &dims) { return os; } template class SparseDistanceCOOSPMVTest - : public ::testing::TestWithParam> { + : public ::testing::TestWithParam< + SparseDistanceCOOSPMVInputs> { public: SparseDistanceCOOSPMVTest() : dist_config(handle) {} - template >* = nullptr> - U make_strategy() - { + template > * = nullptr> + U make_strategy() { return strategy_t(dist_config, params.capacity_threshold, params.map_size); } - template >* = nullptr> - U make_strategy() - { + template > * = nullptr> + U make_strategy() { return strategy_t(dist_config); } template - void compute_dist(reduce_f reduce_func, accum_f accum_func, write_f write_func, bool rev = true) - { - raft::mr::device::buffer coo_rows(dist_config.handle.get_device_allocator(), - dist_config.handle.get_stream(), - max(dist_config.b_nnz, dist_config.a_nnz)); - - raft::sparse::convert::csr_to_coo(dist_config.b_indptr, - dist_config.b_nrows, - coo_rows.data(), - dist_config.b_nnz, + void compute_dist(reduce_f reduce_func, accum_f accum_func, + write_f write_func, bool rev = true) { + raft::mr::device::buffer coo_rows( + dist_config.handle.get_device_allocator(), + dist_config.handle.get_stream(), + max(dist_config.b_nnz, dist_config.a_nnz)); + + raft::sparse::convert::csr_to_coo(dist_config.b_indptr, dist_config.b_nrows, + coo_rows.data(), dist_config.b_nnz, dist_config.handle.get_stream()); strategy_t selected_strategy = make_strategy(); - balanced_coo_pairwise_generalized_spmv(out_dists, - dist_config, - coo_rows.data(), - reduce_func, - accum_func, - write_func, - selected_strategy); + balanced_coo_pairwise_generalized_spmv( + out_dists, dist_config, coo_rows.data(), reduce_func, accum_func, + write_func, selected_strategy); if (rev) { - raft::sparse::convert::csr_to_coo(dist_config.a_indptr, - dist_config.a_nrows, - coo_rows.data(), - dist_config.a_nnz, - dist_config.handle.get_stream()); - - balanced_coo_pairwise_generalized_spmv_rev(out_dists, - dist_config, - coo_rows.data(), - reduce_func, - accum_func, - write_func, - selected_strategy); + raft::sparse::convert::csr_to_coo( + dist_config.a_indptr, dist_config.a_nrows, coo_rows.data(), + dist_config.a_nnz, dist_config.handle.get_stream()); + + balanced_coo_pairwise_generalized_spmv_rev( + out_dists, dist_config, coo_rows.data(), reduce_func, accum_func, + write_func, selected_strategy); } } - void run_spmv() - { + void run_spmv() { switch (params.input_configuration.metric) { case raft::distance::DistanceType::InnerProduct: compute_dist(Product(), Sum(), AtomicAdd(), true); @@ -140,69 +129,75 @@ class SparseDistanceCOOSPMVTest break; case raft::distance::DistanceType::Canberra: compute_dist( - [] __device__(value_t a, value_t b) { return fabsf(a - b) / (fabsf(a) + fabsf(b)); }, - Sum(), - AtomicAdd()); + [] __device__(value_t a, value_t b) { + return fabsf(a - b) / (fabsf(a) + fabsf(b)); + }, + Sum(), AtomicAdd()); + break; + case raft::distance::DistanceType::L1: + compute_dist(AbsDiff(), Sum(), AtomicAdd()); + break; + case raft::distance::DistanceType::Linf: + compute_dist(AbsDiff(), Max(), AtomicMax()); break; - case raft::distance::DistanceType::L1: compute_dist(AbsDiff(), Sum(), AtomicAdd()); break; - case raft::distance::DistanceType::Linf: compute_dist(AbsDiff(), Max(), AtomicMax()); break; case raft::distance::DistanceType::LpUnexpanded: { - compute_dist(PDiff(params.input_configuration.metric_arg), Sum(), AtomicAdd()); + compute_dist(PDiff(params.input_configuration.metric_arg), Sum(), + AtomicAdd()); float p = 1.0f / params.input_configuration.metric_arg; raft::linalg::unaryOp( - out_dists, - out_dists, - dist_config.a_nrows * dist_config.b_nrows, + out_dists, out_dists, dist_config.a_nrows * dist_config.b_nrows, [=] __device__(value_t input) { return powf(input, p); }, dist_config.handle.get_stream()); } break; - default: throw raft::exception("Unknown distance"); + default: + throw raft::exception("Unknown distance"); } } protected: - void make_data() - { - std::vector indptr_h = params.input_configuration.indptr_h; + void make_data() { + std::vector indptr_h = params.input_configuration.indptr_h; std::vector indices_h = params.input_configuration.indices_h; - std::vector data_h = params.input_configuration.data_h; + std::vector data_h = params.input_configuration.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); allocate(data, data_h.size()); - update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream()); - update_device(indices, indices_h.data(), indices_h.size(), handle.get_stream()); + update_device(indptr, indptr_h.data(), indptr_h.size(), + handle.get_stream()); + update_device(indices, indices_h.data(), indices_h.size(), + handle.get_stream()); update_device(data, data_h.data(), data_h.size(), handle.get_stream()); - std::vector out_dists_ref_h = params.input_configuration.out_dists_ref_h; + std::vector out_dists_ref_h = + params.input_configuration.out_dists_ref_h; allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1)); - update_device( - out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), handle.get_stream()); + update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), + handle.get_stream()); } - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam< SparseDistanceCOOSPMVInputs>::GetParam(); make_data(); - dist_config.b_nrows = params.input_configuration.indptr_h.size() - 1; - dist_config.b_ncols = params.input_configuration.n_cols; - dist_config.b_nnz = params.input_configuration.indices_h.size(); - dist_config.b_indptr = indptr; + dist_config.b_nrows = params.input_configuration.indptr_h.size() - 1; + dist_config.b_ncols = params.input_configuration.n_cols; + dist_config.b_nnz = params.input_configuration.indices_h.size(); + dist_config.b_indptr = indptr; dist_config.b_indices = indices; - dist_config.b_data = data; - dist_config.a_nrows = params.input_configuration.indptr_h.size() - 1; - dist_config.a_ncols = params.input_configuration.n_cols; - dist_config.a_nnz = params.input_configuration.indices_h.size(); - dist_config.a_indptr = indptr; + dist_config.b_data = data; + dist_config.a_nrows = params.input_configuration.indptr_h.size() - 1; + dist_config.a_ncols = params.input_configuration.n_cols; + dist_config.a_nnz = params.input_configuration.indices_h.size(); + dist_config.a_indptr = indptr; dist_config.a_indices = indices; - dist_config.a_data = data; + dist_config.a_data = data; int out_size = dist_config.a_nrows * dist_config.b_nrows; @@ -213,8 +208,7 @@ class SparseDistanceCOOSPMVTest CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); @@ -223,10 +217,8 @@ class SparseDistanceCOOSPMVTest CUDA_CHECK(cudaFree(out_dists_ref)); } - void compare() - { - ASSERT_TRUE(devArrMatch(out_dists_ref, - out_dists, + void compare() { + ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, params.input_configuration.out_dists_ref_h.size(), CompareApprox(1e-3))); } @@ -236,7 +228,7 @@ class SparseDistanceCOOSPMVTest // input data value_idx *indptr, *indices; - value_t* data; + value_t *data; // output data value_t *out_dists, *out_dists_ref; @@ -251,7 +243,8 @@ const InputConfiguration input_inner_product = { {0, 2, 4, 6, 8}, {0, 1, 0, 1, 0, 1, 0, 1}, {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}, - {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0}, + {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 5.0}, raft::distance::DistanceType::InnerProduct, 0.0}; @@ -282,379 +275,384 @@ const InputConfiguration input_l2_unexpanded = { raft::distance::DistanceType::L2Unexpanded, 0.0}; -const InputConfiguration input_canberra = { - 10, - {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, - 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, - 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, - 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, - 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, - 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0.0, - 3.3954660629919076, - 5.6469232737388815, - 6.373112846266441, - 4.0212880272531715, - 6.916281504639404, - 5.741508386786526, - 5.411470999663036, - 9.0, - 4.977014354725805, - 3.3954660629919076, - 0.0, - 7.56256082439209, - 5.540261147481582, - 4.832322929216881, - 4.62003193872216, - 6.498056792320361, - 4.309846252268695, - 6.317531174829905, - 6.016362684141827, - 5.6469232737388815, - 7.56256082439209, - 0.0, - 5.974878731322299, - 4.898357301336036, - 6.442097410320605, - 5.227077347287883, - 7.134101195584642, - 5.457753923371659, - 7.0, - 6.373112846266441, - 5.540261147481582, - 5.974878731322299, - 0.0, - 5.5507273748583, - 4.897749658726415, - 9.0, - 8.398776718824767, - 3.908281400328807, - 4.83431066343688, - 4.0212880272531715, - 4.832322929216881, - 4.898357301336036, - 5.5507273748583, - 0.0, - 6.632989819428174, - 7.438852294822894, - 5.6631570310967465, - 7.579428202635459, - 6.760811985364303, - 6.916281504639404, - 4.62003193872216, - 6.442097410320605, - 4.897749658726415, - 6.632989819428174, - 0.0, - 5.249404187382862, - 6.072559523278559, - 4.07661278488929, - 6.19678948003145, - 5.741508386786526, - 6.498056792320361, - 5.227077347287883, - 9.0, - 7.438852294822894, - 5.249404187382862, - 0.0, - 3.854811639654704, - 6.652724827169063, - 5.298236851430971, - 5.411470999663036, - 4.309846252268695, - 7.134101195584642, - 8.398776718824767, - 5.6631570310967465, - 6.072559523278559, - 3.854811639654704, - 0.0, - 7.529184598969917, - 6.903282911791188, - 9.0, - 6.317531174829905, - 5.457753923371659, - 3.908281400328807, - 7.579428202635459, - 4.07661278488929, - 6.652724827169063, - 7.529184598969917, - 0.0, - 7.0, - 4.977014354725805, - 6.016362684141827, - 7.0, - 4.83431066343688, - 6.760811985364303, - 6.19678948003145, - 5.298236851430971, - 6.903282911791188, - 7.0, - 0.0}, - raft::distance::DistanceType::Canberra, - 0.0}; - -const InputConfiguration input_lp_unexpanded = { - 10, - {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, - 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, - 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, - 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, - 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, - 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0.0, - 1.31462855332296, - 1.3690307816129905, - 1.698603990921237, - 1.3460470789553531, - 1.6636670712582544, - 1.2651744044972217, - 1.1938329352055201, - 1.8811409082590185, - 1.3653115050624267, - 1.31462855332296, - 0.0, - 1.9447722703291133, - 1.42818777206562, - 1.4685491458946494, - 1.3071999866010466, - 1.4988622861692171, - 0.9698559287406783, - 1.4972023224597841, - 1.5243383567266802, - 1.3690307816129905, - 1.9447722703291133, - 0.0, - 1.2748400840107568, - 1.0599569946448246, - 1.546591282841402, - 1.147526531928459, - 1.447002179128145, - 1.5982242387673176, - 1.3112533607072414, - 1.698603990921237, - 1.42818777206562, - 1.2748400840107568, - 0.0, - 1.038121552545461, - 1.011788365364402, - 1.3907391109256988, - 1.3128200942311496, - 1.19595706584447, - 1.3233328139624725, - 1.3460470789553531, - 1.4685491458946494, - 1.0599569946448246, - 1.038121552545461, - 0.0, - 1.3642741698145529, - 1.3493868683808095, - 1.394942694628328, - 1.572881849642552, - 1.380122665319464, - 1.6636670712582544, - 1.3071999866010466, - 1.546591282841402, - 1.011788365364402, - 1.3642741698145529, - 0.0, - 1.018961640373018, - 1.0114394258945634, - 0.8338711034820684, - 1.1247823842299223, - 1.2651744044972217, - 1.4988622861692171, - 1.147526531928459, - 1.3907391109256988, - 1.3493868683808095, - 1.018961640373018, - 0.0, - 0.7701238110357329, - 1.245486437864406, - 0.5551259549534626, - 1.1938329352055201, - 0.9698559287406783, - 1.447002179128145, - 1.3128200942311496, - 1.394942694628328, - 1.0114394258945634, - 0.7701238110357329, - 0.0, - 1.1886800117391216, - 1.0083692448135637, - 1.8811409082590185, - 1.4972023224597841, - 1.5982242387673176, - 1.19595706584447, - 1.572881849642552, - 0.8338711034820684, - 1.245486437864406, - 1.1886800117391216, - 0.0, - 1.3661374102525012, - 1.3653115050624267, - 1.5243383567266802, - 1.3112533607072414, - 1.3233328139624725, - 1.380122665319464, - 1.1247823842299223, - 0.5551259549534626, - 1.0083692448135637, - 1.3661374102525012, - 0.0}, - raft::distance::DistanceType::LpUnexpanded, - 2.0}; - -const InputConfiguration input_linf = { - 10, - {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, - 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, - 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, - 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, - 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, - 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0.0, - 0.9251771844789913, - 0.9036452083899731, - 0.9251771844789913, - 0.8706483735804971, - 0.9251771844789913, - 0.717493881903289, - 0.6920214832303888, - 0.9251771844789913, - 0.9251771844789913, - 0.9251771844789913, - 0.0, - 0.9036452083899731, - 0.8655339692155823, - 0.8706483735804971, - 0.8655339692155823, - 0.8655339692155823, - 0.6329837991017668, - 0.8655339692155823, - 0.8655339692155823, - 0.9036452083899731, - 0.9036452083899731, - 0.0, - 0.7988276152181608, - 0.7028075145996631, - 0.9036452083899731, - 0.9036452083899731, - 0.9036452083899731, - 0.8429599432532096, - 0.9036452083899731, - 0.9251771844789913, - 0.8655339692155823, - 0.7988276152181608, - 0.0, - 0.48376552205293305, - 0.8206394616536681, - 0.8206394616536681, - 0.8206394616536681, - 0.8429599432532096, - 0.8206394616536681, - 0.8706483735804971, - 0.8706483735804971, - 0.7028075145996631, - 0.48376552205293305, - 0.0, - 0.8706483735804971, - 0.8706483735804971, - 0.8706483735804971, - 0.8429599432532096, - 0.8706483735804971, - 0.9251771844789913, - 0.8655339692155823, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.0, - 0.8853924473642432, - 0.535821510936138, - 0.6497196601457607, - 0.8853924473642432, - 0.717493881903289, - 0.8655339692155823, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.8853924473642432, - 0.0, - 0.5279604218147174, - 0.6658348373853169, - 0.33799874888632914, - 0.6920214832303888, - 0.6329837991017668, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.535821510936138, - 0.5279604218147174, - 0.0, - 0.662579808115858, - 0.5079750812968089, - 0.9251771844789913, - 0.8655339692155823, - 0.8429599432532096, - 0.8429599432532096, - 0.8429599432532096, - 0.6497196601457607, - 0.6658348373853169, - 0.662579808115858, - 0.0, - 0.8429599432532096, - 0.9251771844789913, - 0.8655339692155823, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.8853924473642432, - 0.33799874888632914, - 0.5079750812968089, - 0.8429599432532096, - 0.0}, - raft::distance::DistanceType::Linf, +const InputConfiguration input_canberra = + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, + 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, + 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, + 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, + 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, + 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, + 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, + 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 3.3954660629919076, + 5.6469232737388815, + 6.373112846266441, + 4.0212880272531715, + 6.916281504639404, + 5.741508386786526, + 5.411470999663036, + 9.0, + 4.977014354725805, + 3.3954660629919076, + 0.0, + 7.56256082439209, + 5.540261147481582, + 4.832322929216881, + 4.62003193872216, + 6.498056792320361, + 4.309846252268695, + 6.317531174829905, + 6.016362684141827, + 5.6469232737388815, + 7.56256082439209, + 0.0, + 5.974878731322299, + 4.898357301336036, + 6.442097410320605, + 5.227077347287883, + 7.134101195584642, + 5.457753923371659, + 7.0, + 6.373112846266441, + 5.540261147481582, + 5.974878731322299, + 0.0, + 5.5507273748583, + 4.897749658726415, + 9.0, + 8.398776718824767, + 3.908281400328807, + 4.83431066343688, + 4.0212880272531715, + 4.832322929216881, + 4.898357301336036, + 5.5507273748583, + 0.0, + 6.632989819428174, + 7.438852294822894, + 5.6631570310967465, + 7.579428202635459, + 6.760811985364303, + 6.916281504639404, + 4.62003193872216, + 6.442097410320605, + 4.897749658726415, + 6.632989819428174, + 0.0, + 5.249404187382862, + 6.072559523278559, + 4.07661278488929, + 6.19678948003145, + 5.741508386786526, + 6.498056792320361, + 5.227077347287883, + 9.0, + 7.438852294822894, + 5.249404187382862, + 0.0, + 3.854811639654704, + 6.652724827169063, + 5.298236851430971, + 5.411470999663036, + 4.309846252268695, + 7.134101195584642, + 8.398776718824767, + 5.6631570310967465, + 6.072559523278559, + 3.854811639654704, + 0.0, + 7.529184598969917, + 6.903282911791188, + 9.0, + 6.317531174829905, + 5.457753923371659, + 3.908281400328807, + 7.579428202635459, + 4.07661278488929, + 6.652724827169063, + 7.529184598969917, + 0.0, + 7.0, + 4.977014354725805, + 6.016362684141827, + 7.0, + 4.83431066343688, + 6.760811985364303, + 6.19678948003145, + 5.298236851430971, + 6.903282911791188, + 7.0, + 0.0}, + raft::distance::DistanceType::Canberra, + 0.0}; + +const InputConfiguration input_lp_unexpanded = + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, + 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, + 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, + 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, + 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, + 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, + 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, + 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 1.31462855332296, + 1.3690307816129905, + 1.698603990921237, + 1.3460470789553531, + 1.6636670712582544, + 1.2651744044972217, + 1.1938329352055201, + 1.8811409082590185, + 1.3653115050624267, + 1.31462855332296, + 0.0, + 1.9447722703291133, + 1.42818777206562, + 1.4685491458946494, + 1.3071999866010466, + 1.4988622861692171, + 0.9698559287406783, + 1.4972023224597841, + 1.5243383567266802, + 1.3690307816129905, + 1.9447722703291133, + 0.0, + 1.2748400840107568, + 1.0599569946448246, + 1.546591282841402, + 1.147526531928459, + 1.447002179128145, + 1.5982242387673176, + 1.3112533607072414, + 1.698603990921237, + 1.42818777206562, + 1.2748400840107568, + 0.0, + 1.038121552545461, + 1.011788365364402, + 1.3907391109256988, + 1.3128200942311496, + 1.19595706584447, + 1.3233328139624725, + 1.3460470789553531, + 1.4685491458946494, + 1.0599569946448246, + 1.038121552545461, + 0.0, + 1.3642741698145529, + 1.3493868683808095, + 1.394942694628328, + 1.572881849642552, + 1.380122665319464, + 1.6636670712582544, + 1.3071999866010466, + 1.546591282841402, + 1.011788365364402, + 1.3642741698145529, + 0.0, + 1.018961640373018, + 1.0114394258945634, + 0.8338711034820684, + 1.1247823842299223, + 1.2651744044972217, + 1.4988622861692171, + 1.147526531928459, + 1.3907391109256988, + 1.3493868683808095, + 1.018961640373018, + 0.0, + 0.7701238110357329, + 1.245486437864406, + 0.5551259549534626, + 1.1938329352055201, + 0.9698559287406783, + 1.447002179128145, + 1.3128200942311496, + 1.394942694628328, + 1.0114394258945634, + 0.7701238110357329, + 0.0, + 1.1886800117391216, + 1.0083692448135637, + 1.8811409082590185, + 1.4972023224597841, + 1.5982242387673176, + 1.19595706584447, + 1.572881849642552, + 0.8338711034820684, + 1.245486437864406, + 1.1886800117391216, + 0.0, + 1.3661374102525012, + 1.3653115050624267, + 1.5243383567266802, + 1.3112533607072414, + 1.3233328139624725, + 1.380122665319464, + 1.1247823842299223, + 0.5551259549534626, + 1.0083692448135637, + 1.3661374102525012, + 0.0}, + raft::distance::DistanceType::LpUnexpanded, + 2.0}; + +const InputConfiguration input_linf = + {10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, + 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, + 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, + 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, + 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, + 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, + 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, + 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 0.9251771844789913, + 0.9036452083899731, + 0.9251771844789913, + 0.8706483735804971, + 0.9251771844789913, + 0.717493881903289, + 0.6920214832303888, + 0.9251771844789913, + 0.9251771844789913, + 0.9251771844789913, + 0.0, + 0.9036452083899731, + 0.8655339692155823, + 0.8706483735804971, + 0.8655339692155823, + 0.8655339692155823, + 0.6329837991017668, + 0.8655339692155823, + 0.8655339692155823, + 0.9036452083899731, + 0.9036452083899731, + 0.0, + 0.7988276152181608, + 0.7028075145996631, + 0.9036452083899731, + 0.9036452083899731, + 0.9036452083899731, + 0.8429599432532096, + 0.9036452083899731, + 0.9251771844789913, + 0.8655339692155823, + 0.7988276152181608, + 0.0, + 0.48376552205293305, + 0.8206394616536681, + 0.8206394616536681, + 0.8206394616536681, + 0.8429599432532096, + 0.8206394616536681, + 0.8706483735804971, + 0.8706483735804971, + 0.7028075145996631, + 0.48376552205293305, + 0.0, + 0.8706483735804971, + 0.8706483735804971, + 0.8706483735804971, + 0.8429599432532096, + 0.8706483735804971, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.0, + 0.8853924473642432, + 0.535821510936138, + 0.6497196601457607, + 0.8853924473642432, + 0.717493881903289, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.0, + 0.5279604218147174, + 0.6658348373853169, + 0.33799874888632914, + 0.6920214832303888, + 0.6329837991017668, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.535821510936138, + 0.5279604218147174, + 0.0, + 0.662579808115858, + 0.5079750812968089, + 0.9251771844789913, + 0.8655339692155823, + 0.8429599432532096, + 0.8429599432532096, + 0.8429599432532096, + 0.6497196601457607, + 0.6658348373853169, + 0.662579808115858, + 0.0, + 0.8429599432532096, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.33799874888632914, + 0.5079750812968089, + 0.8429599432532096, + 0.0}, + raft::distance::DistanceType::Linf, + 0.0}; + +const InputConfiguration input_l1 = { + 4, + {0, 1, 1, 2, 4}, + {3, 2, 0, 1}, // indices + {0.99296, 0.42180, 0.11687, 0.305869}, + { + // dense output + 0.0, + 0.99296, + 1.41476, + 1.415707, + 0.99296, + 0.0, + 0.42180, + 0.42274, + 1.41476, + 0.42180, + 0.0, + 0.84454, + 1.41570, + 0.42274, + 0.84454, + 0.0, + }, + raft::distance::DistanceType::L1, 0.0}; -const InputConfiguration input_l1 = {4, - {0, 1, 1, 2, 4}, - {3, 2, 0, 1}, // indices - {0.99296, 0.42180, 0.11687, 0.305869}, - { - // dense output - 0.0, - 0.99296, - 1.41476, - 1.415707, - 0.99296, - 0.0, - 0.42180, - 0.42274, - 1.41476, - 0.42180, - 0.0, - 0.84454, - 1.41570, - 0.42274, - 0.84454, - 0.0, - }, - raft::distance::DistanceType::L1, - 0.0}; - // test dense smem strategy -const std::vector> - inputs_dense_strategy = {{input_inner_product}, - {input_l2_unexpanded}, - {input_canberra}, - {input_lp_unexpanded}, - {input_linf}, - {input_l1}}; +const std::vector< + SparseDistanceCOOSPMVInputs> + inputs_dense_strategy = {{input_inner_product}, {input_l2_unexpanded}, + {input_canberra}, {input_lp_unexpanded}, + {input_linf}, {input_l1}}; typedef SparseDistanceCOOSPMVTest SparseDistanceCOOSPMVTestDenseStrategyF; @@ -664,22 +662,22 @@ INSTANTIATE_TEST_CASE_P(SparseDistanceCOOSPMVTests, ::testing::ValuesIn(inputs_dense_strategy)); // test hash and chunk strategy -const std::vector> inputs_hash_strategy = { - {input_inner_product}, - {input_inner_product, 0.5, 2}, - {input_l2_unexpanded}, - {input_l2_unexpanded, 0.5, 2}, - {input_canberra}, - {input_canberra, 0.5, 2}, - {input_canberra, 0.5, 6}, - {input_lp_unexpanded}, - {input_lp_unexpanded, 0.5, 2}, - {input_lp_unexpanded, 0.5, 6}, - {input_linf}, - {input_linf, 0.5, 2}, - {input_linf, 0.5, 6}, - {input_l1}, - {input_l1, 0.5, 2}}; +const std::vector> + inputs_hash_strategy = {{input_inner_product}, + {input_inner_product, 0.5, 2}, + {input_l2_unexpanded}, + {input_l2_unexpanded, 0.5, 2}, + {input_canberra}, + {input_canberra, 0.5, 2}, + {input_canberra, 0.5, 6}, + {input_lp_unexpanded}, + {input_lp_unexpanded, 0.5, 2}, + {input_lp_unexpanded, 0.5, 6}, + {input_linf}, + {input_linf, 0.5, 2}, + {input_linf, 0.5, 6}, + {input_l1}, + {input_l1, 0.5, 2}}; typedef SparseDistanceCOOSPMVTest SparseDistanceCOOSPMVTestHashStrategyF; diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu index 8d6675f954..0589637061 100644 --- a/cpp/test/sparse/distance.cu +++ b/cpp/test/sparse/distance.cu @@ -50,8 +50,8 @@ struct SparseDistanceInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const SparseDistanceInputs& dims) -{ +::std::ostream &operator<<( + ::std::ostream &os, const SparseDistanceInputs &dims) { return os; } @@ -61,24 +61,24 @@ class SparseDistanceTest public: SparseDistanceTest() : dist_config(handle) {} - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); + void SetUp() override { + params = ::testing::TestWithParam< + SparseDistanceInputs>::GetParam(); make_data(); - dist_config.b_nrows = params.indptr_h.size() - 1; - dist_config.b_ncols = params.n_cols; - dist_config.b_nnz = params.indices_h.size(); - dist_config.b_indptr = indptr; + dist_config.b_nrows = params.indptr_h.size() - 1; + dist_config.b_ncols = params.n_cols; + dist_config.b_nnz = params.indices_h.size(); + dist_config.b_indptr = indptr; dist_config.b_indices = indices; - dist_config.b_data = data; - dist_config.a_nrows = params.indptr_h.size() - 1; - dist_config.a_ncols = params.n_cols; - dist_config.a_nnz = params.indices_h.size(); - dist_config.a_indptr = indptr; + dist_config.b_data = data; + dist_config.a_nrows = params.indptr_h.size() - 1; + dist_config.a_ncols = params.n_cols; + dist_config.a_nnz = params.indices_h.size(); + dist_config.a_indptr = indptr; dist_config.a_indices = indices; - dist_config.a_data = data; + dist_config.a_data = data; int out_size = dist_config.a_nrows * dist_config.b_nrows; @@ -89,8 +89,7 @@ class SparseDistanceTest CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); @@ -99,34 +98,33 @@ class SparseDistanceTest CUDA_CHECK(cudaFree(out_dists_ref)); } - void compare() - { - ASSERT_TRUE(devArrMatch( - out_dists_ref, out_dists, params.out_dists_ref_h.size(), CompareApprox(1e-3))); + void compare() { + ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, + params.out_dists_ref_h.size(), + CompareApprox(1e-3))); } protected: - void make_data() - { - std::vector indptr_h = params.indptr_h; + void make_data() { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); allocate(data, data_h.size()); - update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream()); - update_device(indices, indices_h.data(), indices_h.size(), handle.get_stream()); + update_device(indptr, indptr_h.data(), indptr_h.size(), + handle.get_stream()); + update_device(indices, indices_h.data(), indices_h.size(), + handle.get_stream()); update_device(data, data_h.data(), data_h.size(), handle.get_stream()); std::vector out_dists_ref_h = params.out_dists_ref_h; allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1)); - update_device(out_dists_ref, - out_dists_ref_h.data(), - out_dists_ref_h.size(), + update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), dist_config.handle.get_stream()); } @@ -134,7 +132,7 @@ class SparseDistanceTest // input data value_idx *indptr, *indices; - value_t* data; + value_t *data; // output data value_t *out_dists, *out_dists_ref; @@ -189,7 +187,8 @@ const std::vector> inputs_i32_f = { {0, 2, 4, 6, 8}, {0, 1, 0, 1, 0, 1, 0, 1}, {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}, - {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0}, + {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 5.0}, raft::distance::DistanceType::InnerProduct, 0.0}, {2, @@ -220,33 +219,40 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, - 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, - 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, - 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, - 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, - 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0., 0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, 0.58146987, 0.44940102, - 1., 0.76978799, 0.39419924, 0., 0.97577154, 0.48904013, 0.48300801, 0.45087445, - 0.73323749, 0.21050481, 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0., 0.51413997, - 0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819, 1., 0.79593037, 0.48904013, - 0.51413997, 0., 0.28605559, 0.35772784, 1., 0.60889396, 0.43324829, 0.84923694, - 0.45658883, 0.48300801, 0.31195441, 0.28605559, 0., 0.58623212, 0.6745457, 0.60287165, - 0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, 0.58623212, 0., - 0.77917274, 0.48390993, 0.24558392, 0.99166225, 0.58146987, 0.73323749, 0.67534399, 1., - 0.6745457, 0.77917274, 0., 0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481, - 0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0., 0.51360432, 0.68185144, - 1., 0.54847744, 0.8321819, 0.43324829, 0.67676228, 0.24558392, 0.76064776, 0.51360432, - 0., 1., 0.76978799, 0.78021386, 1., 0.84923694, 0.73155632, 0.99166225, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, + 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, + 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, + 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, + 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, + 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, + 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, + 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0., 0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, + 0.58146987, 0.44940102, 1., 0.76978799, 0.39419924, 0., + 0.97577154, 0.48904013, 0.48300801, 0.45087445, 0.73323749, 0.21050481, + 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0., 0.51413997, + 0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819, 1., + 0.79593037, 0.48904013, 0.51413997, 0., 0.28605559, 0.35772784, + 1., 0.60889396, 0.43324829, 0.84923694, 0.45658883, 0.48300801, + 0.31195441, 0.28605559, 0., 0.58623212, 0.6745457, 0.60287165, + 0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, + 0.58623212, 0., 0.77917274, 0.48390993, 0.24558392, 0.99166225, + 0.58146987, 0.73323749, 0.67534399, 1., 0.6745457, 0.77917274, + 0., 0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481, + 0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0., + 0.51360432, 0.68185144, 1., 0.54847744, 0.8321819, 0.43324829, + 0.67676228, 0.24558392, 0.76064776, 0.51360432, 0., 1., + 0.76978799, 0.78021386, 1., 0.84923694, 0.73155632, 0.99166225, 0.61547536, 0.68185144, 1., 0.}, raft::distance::DistanceType::CosineExpanded, 0.0}, {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, - 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, + 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, + 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, @@ -355,13 +361,15 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, - 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, - 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, - 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, - 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, - 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, + 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, + 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, + 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, + 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, + 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, + 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, + 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, {0.0, 3.3954660629919076, 5.6469232737388815, @@ -467,13 +475,15 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, - 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, - 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, - 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, - 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, - 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, + 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, + 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, + 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, + 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, + 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, + 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, + 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, {0.0, 1.31462855332296, 1.3690307816129905, @@ -579,13 +589,15 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, - 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, - 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, - 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, - 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, - 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, + 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, + 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, + 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, + 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, + 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, + 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, + 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, {0.0, 0.9251771844789913, 0.9036452083899731, @@ -691,14 +703,17 @@ const std::vector> inputs_i32_f = { {15, {0, 5, 8, 9, 15, 20, 26, 31, 34, 38, 45}, - {0, 1, 5, 6, 9, 1, 4, 14, 7, 3, 4, 7, 9, 11, 14, 0, 3, 7, 8, 12, 0, 2, 5, - 7, 8, 14, 4, 9, 10, 11, 13, 4, 10, 14, 5, 6, 8, 9, 0, 2, 3, 4, 6, 10, 11}, - {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, 0.73789274, 0.08450219, - 1., 0.20184723, 0.18036963, 0.12581403, 0.13867603, 0.24040536, 0.11288773, 0.00290246, - 0.09120187, 0.31190555, 0.43245423, 0.16153588, 0.3233026, 0.05279589, 0.1387149, 0.05962761, - 0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, 0.15605804, 0.3867739, - 0.24908977, 0.36413632, 0.37643732, 0.28910679, 0.0198409, 0.31461499, 0.24412279, 0.08327667, - 0.04444576, 0.05047969, 0.26190054, 0.2077349, 0.10803964}, + {0, 1, 5, 6, 9, 1, 4, 14, 7, 3, 4, 7, 9, 11, 14, + 0, 3, 7, 8, 12, 0, 2, 5, 7, 8, 14, 4, 9, 10, 11, + 13, 4, 10, 14, 5, 6, 8, 9, 0, 2, 3, 4, 6, 10, 11}, + {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, + 0.73789274, 0.08450219, 1., 0.20184723, 0.18036963, 0.12581403, + 0.13867603, 0.24040536, 0.11288773, 0.00290246, 0.09120187, 0.31190555, + 0.43245423, 0.16153588, 0.3233026, 0.05279589, 0.1387149, 0.05962761, + 0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, + 0.15605804, 0.3867739, 0.24908977, 0.36413632, 0.37643732, 0.28910679, + 0.0198409, 0.31461499, 0.24412279, 0.08327667, 0.04444576, 0.05047969, + 0.26190054, 0.2077349, 0.10803964}, {1.05367121e-08, 8.35309089e-01, 1.00000000e+00, 9.24116813e-01, 9.90039274e-01, 7.97613546e-01, 8.91271059e-01, 1.00000000e+00, 6.64669302e-01, 8.59439512e-01, 8.35309089e-01, 1.05367121e-08, @@ -757,25 +772,31 @@ const std::vector> inputs_i32_f = { {0, 3, 8, 12, 16, 20, 25, 30, 35, 40, 45}, {0, 3, 4, 0, 1, 2, 3, 4, 1, 2, 3, 4, 0, 2, 3, 4, 0, 1, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4}, - {0.70862347, 0.8232774, 0.12108795, 0.84527547, 0.94937088, 0.03258545, 0.99584118, 0.76835667, - 0.34426657, 0.2357925, 0.01274851, 0.11422017, 0.3437756, 0.31967718, 0.5956055, 0.31610373, - 0.04147273, 0.03724415, 0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529, - 0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329, 0.61364678, 0.22837736, - 0.56609561, 0.29809423, 0.76736686, 0.56460608, 0.98165371, 0.02140123, 0.19881268, 0.26057815, - 0.31648823, 0.89874295, 0.27366735, 0.5119944, 0.11416134}, + {0.70862347, 0.8232774, 0.12108795, 0.84527547, 0.94937088, 0.03258545, + 0.99584118, 0.76835667, 0.34426657, 0.2357925, 0.01274851, 0.11422017, + 0.3437756, 0.31967718, 0.5956055, 0.31610373, 0.04147273, 0.03724415, + 0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529, + 0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329, + 0.61364678, 0.22837736, 0.56609561, 0.29809423, 0.76736686, 0.56460608, + 0.98165371, 0.02140123, 0.19881268, 0.26057815, 0.31648823, 0.89874295, + 0.27366735, 0.5119944, 0.11416134}, {// dense output - 0., 0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794, 0.76962708, 1.122858, - 1.1232498, 1.08166081, 0.48769777, 0., 1.31332116, 0.98318907, 0.42661815, 0.09279052, - 1.35187836, 1.38429055, 0.40658897, 0.56136388, 1.88014197, 1.31332116, 0., 1.82943642, - 1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848, 0.26127048, 0.98318907, - 1.82943642, 0., 0.29945563, 1.08494093, 0.22934281, 0.82801925, 1.74288748, 1.50610116, - 0.26657011, 0.42661815, 1.54826077, 0.29945563, 0., 0.45060069, 0.77814948, 1.45245711, - 1.18328348, 0.82486987, 0.7874794, 0.09279052, 1.05918884, 1.08494093, 0.45060069, 0., - 1.29899154, 1.40683824, 0.48505269, 0.53862363, 0.76962708, 1.35187836, 1.59360067, 0.22934281, - 0.77814948, 1.29899154, 0., 0.33202426, 1.92108999, 1.88812175, 1.122858, 1.38429055, - 1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0., 1.47318624, 1.92660889, - 1.1232498, 0.40658897, 0.60215168, 1.74288748, 1.18328348, 0.48505269, 1.92108999, 1.47318624, - 0., 0.24992619, 1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363, + 0., 0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794, + 0.76962708, 1.122858, 1.1232498, 1.08166081, 0.48769777, 0., + 1.31332116, 0.98318907, 0.42661815, 0.09279052, 1.35187836, 1.38429055, + 0.40658897, 0.56136388, 1.88014197, 1.31332116, 0., 1.82943642, + 1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848, + 0.26127048, 0.98318907, 1.82943642, 0., 0.29945563, 1.08494093, + 0.22934281, 0.82801925, 1.74288748, 1.50610116, 0.26657011, 0.42661815, + 1.54826077, 0.29945563, 0., 0.45060069, 0.77814948, 1.45245711, + 1.18328348, 0.82486987, 0.7874794, 0.09279052, 1.05918884, 1.08494093, + 0.45060069, 0., 1.29899154, 1.40683824, 0.48505269, 0.53862363, + 0.76962708, 1.35187836, 1.59360067, 0.22934281, 0.77814948, 1.29899154, + 0., 0.33202426, 1.92108999, 1.88812175, 1.122858, 1.38429055, + 1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0., + 1.47318624, 1.92660889, 1.1232498, 0.40658897, 0.60215168, 1.74288748, + 1.18328348, 0.48505269, 1.92108999, 1.47318624, 0., 0.24992619, + 1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363, 1.88812175, 1.92660889, 0.24992619, 0.}, raft::distance::DistanceType::CorrelationExpanded, 0.0}, @@ -784,11 +805,12 @@ const std::vector> inputs_i32_f = { {1, 4, 0, 4, 1, 3, 0, 1, 3, 0}, {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, {// dense output - 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 0.8, 1., 1., 1., 1., 1., 1., 1., - 1., 0.8, 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., - 0.8, 1., 1., 1., 0., 1., 1., 0.8, 1., 1., 1., 1., 1., 1., 1., 0., 1., 0.8, 1., 1., - 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 0.8, 0.8, 1., 1., 1., 0.8, 0.8, 1., 0., 1., 1., - 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.8, 1., 1., 1., 0.8, 1., 1., 0.}, + 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 0.8, 1., 1., 1., 1., + 1., 1., 1., 1., 0.8, 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., + 1., 1., 1., 1., 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 0.8, 1., 1., 1., + 1., 1., 1., 1., 0., 1., 0.8, 1., 1., 1., 1., 0.8, 1., 1., 1., 0., 1., + 1., 0.8, 0.8, 1., 1., 1., 0.8, 0.8, 1., 0., 1., 1., 1., 1., 1., 1., 1., + 1., 1., 1., 0., 1., 1., 1., 0.8, 1., 1., 1., 0.8, 1., 1., 0.}, raft::distance::DistanceType::RusselRaoExpanded, 0.0}, {5, @@ -796,12 +818,13 @@ const std::vector> inputs_i32_f = { {0, 3, 4, 4, 2, 3, 0, 2, 3, 2}, {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, {// dense output - 0., 0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, - 0.6, 0.2, 0., 0.6, 0.4, 0., 0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4, 0.2, 0., 0.4, 0., - 0.2, 0., 0.4, 0.6, 0.2, 0., 0.4, 0.2, 0.2, 0.2, 0., 0.2, 0.6, 0.8, 0.4, 0.2, 0.2, - 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., 0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0., 0.2, - 0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8, 0.6, 0.2, 0., 0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4, - 0.2, 0.2, 0.4, 0., 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0.}, + 0., 0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, + 0., 0.4, 0.6, 0.2, 0., 0.6, 0.4, 0., 0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4, + 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., 0.4, 0.2, 0.2, 0.2, 0., + 0.2, 0.6, 0.8, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., + 0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0., 0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8, + 0.6, 0.2, 0., 0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4, 0.2, 0.2, 0.4, 0., 0.2, + 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0.}, raft::distance::DistanceType::HammingUnexpanded, 0.0}, {3, @@ -845,8 +868,7 @@ const std::vector> inputs_i32_f = { typedef SparseDistanceTest SparseDistanceTestF; TEST_P(SparseDistanceTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(SparseDistanceTests, - SparseDistanceTestF, +INSTANTIATE_TEST_CASE_P(SparseDistanceTests, SparseDistanceTestF, ::testing::ValuesIn(inputs_i32_f)); }; // namespace distance diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu index 02be95c8a8..f7954f899f 100644 --- a/cpp/test/sparse/filter.cu +++ b/cpp/test/sparse/filter.cu @@ -36,7 +36,8 @@ struct SparseFilterInputs { }; template -class SparseFilterTests : public ::testing::TestWithParam> { +class SparseFilterTests + : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -49,14 +50,14 @@ class SparseFilterTests : public ::testing::TestWithParam> const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseFilterTests COORemoveZeros; -TEST_P(COORemoveZeros, Result) -{ +TEST_P(COORemoveZeros, Result) { cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc(new raft::mr::device::default_allocator); + std::shared_ptr alloc( + new raft::mr::device::default_allocator); params = ::testing::TestWithParam>::GetParam(); - float* in_h_vals = new float[params.nnz]; + float *in_h_vals = new float[params.nnz]; COO in(alloc, stream, params.nnz, 5, 5); @@ -69,8 +70,8 @@ TEST_P(COORemoveZeros, Result) in_h_vals[2] = 0; in_h_vals[3] = 0; - int* in_h_rows = new int[params.nnz]; - int* in_h_cols = new int[params.nnz]; + int *in_h_rows = new int[params.nnz]; + int *in_h_cols = new int[params.nnz]; for (int i = 0; i < params.nnz; i++) { in_h_rows[i] = params.nnz - i - 1; @@ -86,9 +87,9 @@ TEST_P(COORemoveZeros, Result) int out_rows_ref_h[2] = {0, 3}; int out_cols_ref_h[2] = {4, 1}; - float* out_vals_ref_h = (float*)malloc(2 * sizeof(float)); - out_vals_ref_h[0] = in_h_vals[4]; - out_vals_ref_h[1] = in_h_vals[1]; + float *out_vals_ref_h = (float *)malloc(2 * sizeof(float)); + out_vals_ref_h[0] = in_h_vals[4]; + out_vals_ref_h[1] = in_h_vals[1]; COO out_ref(alloc, stream, 2, 5, 5); COO out(alloc, stream); @@ -99,9 +100,12 @@ TEST_P(COORemoveZeros, Result) op::coo_remove_zeros<32, float>(&in, &out, alloc, stream); - ASSERT_TRUE(raft::devArrMatch(out_ref.rows(), out.rows(), 2, raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out_ref.cols(), out.cols(), 2, raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out_ref.vals(), out.vals(), 2, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out_ref.rows(), out.rows(), 2, + raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out_ref.cols(), out.cols(), 2, + raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out_ref.vals(), out.vals(), 2, + raft::Compare())); CUDA_CHECK(cudaStreamDestroy(stream)); free(out_vals_ref_h); @@ -111,7 +115,8 @@ TEST_P(COORemoveZeros, Result) delete[] in_h_vals; } -INSTANTIATE_TEST_CASE_P(SparseFilterTests, COORemoveZeros, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseFilterTests, COORemoveZeros, + ::testing::ValuesIn(inputsf)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu index ca9da0bc05..8c3bf36318 100644 --- a/cpp/test/sparse/knn.cu +++ b/cpp/test/sparse/knn.cu @@ -50,53 +50,39 @@ struct SparseKNNInputs { int batch_size_index = 2; int batch_size_query = 2; - raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded; + raft::distance::DistanceType metric = + raft::distance::DistanceType::L2SqrtExpanded; }; template -::std::ostream& operator<<(::std::ostream& os, const SparseKNNInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, + const SparseKNNInputs &dims) { return os; } template -class SparseKNNTest : public ::testing::TestWithParam> { +class SparseKNNTest + : public ::testing::TestWithParam> { public: - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); + void SetUp() override { + params = + ::testing::TestWithParam>::GetParam(); n_rows = params.indptr_h.size() - 1; - nnz = params.indices_h.size(); - k = params.k; + nnz = params.indices_h.size(); + k = params.k; make_data(); - raft::sparse::selection::brute_force_knn(indptr, - indices, - data, - nnz, - n_rows, - params.n_cols, - indptr, - indices, - data, - nnz, - n_rows, - params.n_cols, - out_indices, - out_dists, - k, - handle, - params.batch_size_index, - params.batch_size_query, - params.metric); + raft::sparse::selection::brute_force_knn( + indptr, indices, data, nnz, n_rows, params.n_cols, indptr, indices, data, + nnz, n_rows, params.n_cols, out_indices, out_dists, k, handle, + params.batch_size_index, params.batch_size_query, params.metric); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); CUDA_CHECK(cudaFree(data)); @@ -106,37 +92,39 @@ class SparseKNNTest : public ::testing::TestWithParam(1e-4))); - ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k, Compare())); + void compare() { + ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k, + CompareApprox(1e-4))); + ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k, + Compare())); } protected: - void make_data() - { - std::vector indptr_h = params.indptr_h; + void make_data() { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); allocate(data, data_h.size()); - update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream()); - update_device(indices, indices_h.data(), indices_h.size(), handle.get_stream()); + update_device(indptr, indptr_h.data(), indptr_h.size(), + handle.get_stream()); + update_device(indices, indices_h.data(), indices_h.size(), + handle.get_stream()); update_device(data, data_h.data(), data_h.size(), handle.get_stream()); - std::vector out_dists_ref_h = params.out_dists_ref_h; + std::vector out_dists_ref_h = params.out_dists_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; allocate(out_indices_ref, out_indices_ref_h.size()); allocate(out_dists_ref, out_dists_ref_h.size()); - update_device( - out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), handle.get_stream()); - update_device( - out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), handle.get_stream()); + update_device(out_indices_ref, out_indices_ref_h.data(), + out_indices_ref_h.size(), handle.get_stream()); + update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), + handle.get_stream()); allocate(out_dists, n_rows * k); allocate(out_indices, n_rows * k); @@ -148,14 +136,14 @@ class SparseKNNTest : public ::testing::TestWithParam params; }; @@ -173,7 +161,8 @@ const std::vector> inputs_i32_f = { raft::distance::DistanceType::L2SqrtExpanded}}; typedef SparseKNNTest SparseKNNTestF; TEST_P(SparseKNNTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, + ::testing::ValuesIn(inputs_i32_f)); }; // end namespace selection }; // end namespace sparse diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu index f660e68aa3..ec41b32374 100644 --- a/cpp/test/sparse/knn_graph.cu +++ b/cpp/test/sparse/knn_graph.cu @@ -29,9 +29,8 @@ namespace raft { namespace sparse { template -__global__ void assert_symmetry( - value_idx* rows, value_idx* cols, value_t* vals, value_idx nnz, value_idx* sum) -{ +__global__ void assert_symmetry(value_idx *rows, value_idx *cols, value_t *vals, + value_idx nnz, value_idx *sum) { int tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; @@ -51,21 +50,22 @@ struct KNNGraphInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const KNNGraphInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, + const KNNGraphInputs &dims) { return os; } template -class KNNGraphTest : public ::testing::TestWithParam> { - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); +class KNNGraphTest + : public ::testing::TestWithParam> { + void SetUp() override { + params = + ::testing::TestWithParam>::GetParam(); raft::handle_t handle; auto alloc = handle.get_device_allocator(); - stream = handle.get_stream(); + stream = handle.get_stream(); out = new raft::sparse::COO(alloc, stream); @@ -74,7 +74,8 @@ class KNNGraphTest : public ::testing::TestWithParam sum(1, stream); @@ -90,8 +91,7 @@ class KNNGraphTest : public ::testing::TestWithParam* out; + raft::sparse::COO *out; - value_t* X; + value_t *X; value_idx sum_h; @@ -115,15 +115,13 @@ const std::vector> knn_graph_inputs_fint = { {4, 2, {0, 100, 0.01, 0.02, 5000, 10000, -5, -2}, 2}}; typedef KNNGraphTest KNNGraphTestF_int; -TEST_P(KNNGraphTestF_int, Result) -{ +TEST_P(KNNGraphTestF_int, Result) { // nnz should not be larger than twice m * k ASSERT_TRUE(out->nnz <= (params.m * params.k * 2)); ASSERT_TRUE(sum_h == 0); } -INSTANTIATE_TEST_CASE_P(KNNGraphTest, - KNNGraphTestF_int, +INSTANTIATE_TEST_CASE_P(KNNGraphTest, KNNGraphTestF_int, ::testing::ValuesIn(knn_graph_inputs_fint)); } // namespace sparse diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index 0ca7cec4e9..ce567e4298 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -55,44 +55,45 @@ struct LinkageInputs { * @param b: number of pairs of points that both the clusters have classified differently */ template -__global__ void computeTheNumerator( - const T* firstClusterArray, const T* secondClusterArray, uint64_t size, uint64_t* a, uint64_t* b) -{ - // calculating the indices of pairs of datapoints compared by the current thread +__global__ void computeTheNumerator(const T* firstClusterArray, + const T* secondClusterArray, uint64_t size, + uint64_t* a, uint64_t* b) { + //calculating the indices of pairs of datapoints compared by the current thread uint64_t j = threadIdx.x + blockIdx.x * blockDim.x; uint64_t i = threadIdx.y + blockIdx.y * blockDim.y; - // thread-local variables to count a and b + //thread-local variables to count a and b uint64_t myA = 0, myB = 0; if (i < size && j < size && j < i) { - // checking if the pair have been classified the same by both the clusters + //checking if the pair have been classified the same by both the clusters if (firstClusterArray[i] == firstClusterArray[j] && secondClusterArray[i] == secondClusterArray[j]) { ++myA; } - // checking if the pair have been classified differently by both the clusters + //checking if the pair have been classified differently by both the clusters else if (firstClusterArray[i] != firstClusterArray[j] && secondClusterArray[i] != secondClusterArray[j]) { ++myB; } } - // specialize blockReduce for a 2D block of 1024 threads of type uint64_t - typedef cub::BlockReduce + //specialize blockReduce for a 2D block of 1024 threads of type uint64_t + typedef cub::BlockReduce BlockReduce; - // Allocate shared memory for blockReduce + //Allocate shared memory for blockReduce __shared__ typename BlockReduce::TempStorage temp_storage; - // summing up thread-local counts specific to a block + //summing up thread-local counts specific to a block myA = BlockReduce(temp_storage).Sum(myA); __syncthreads(); myB = BlockReduce(temp_storage).Sum(myB); __syncthreads(); - // executed once per block + //executed once per block if (threadIdx.x == 0 && threadIdx.y == 0) { raft::myAtomicAdd((unsigned long long int*)a, myA); raft::myAtomicAdd((unsigned long long int*)b, myB); @@ -100,105 +101,102 @@ __global__ void computeTheNumerator( } /** - * @brief Function to calculate RandIndex - * more info on rand index - * @param firstClusterArray: the array of classes of type T - * @param secondClusterArray: the array of classes of type T - * @param size: the size of the data points of type uint64_t - * @param allocator: object that takes care of temporary device memory allocation of type - * std::shared_ptr - * @param stream: the cudaStream object - */ +* @brief Function to calculate RandIndex +* more info on rand index +* @param firstClusterArray: the array of classes of type T +* @param secondClusterArray: the array of classes of type T +* @param size: the size of the data points of type uint64_t +* @param allocator: object that takes care of temporary device memory allocation of type std::shared_ptr +* @param stream: the cudaStream object +*/ template -double compute_rand_index(T* firstClusterArray, - T* secondClusterArray, - uint64_t size, - std::shared_ptr allocator, - cudaStream_t stream) -{ - // rand index for size less than 2 is not defined +double compute_rand_index( + T* firstClusterArray, T* secondClusterArray, uint64_t size, + std::shared_ptr allocator, cudaStream_t stream) { + //rand index for size less than 2 is not defined ASSERT(size >= 2, "Rand Index for size less than 2 not defined!"); - // allocating and initializing memory for a and b in the GPU + //allocating and initializing memory for a and b in the GPU raft::mr::device::buffer arr_buf(allocator, stream, 2); CUDA_CHECK(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream)); - // kernel configuration + //kernel configuration static const int BLOCK_DIM_Y = 16, BLOCK_DIM_X = 16; dim3 numThreadsPerBlock(BLOCK_DIM_X, BLOCK_DIM_Y); dim3 numBlocks(raft::ceildiv(size, numThreadsPerBlock.x), raft::ceildiv(size, numThreadsPerBlock.y)); - // calling the kernel - computeTheNumerator<<>>( - firstClusterArray, secondClusterArray, size, arr_buf.data(), arr_buf.data() + 1); + //calling the kernel + computeTheNumerator + <<>>( + firstClusterArray, secondClusterArray, size, arr_buf.data(), + arr_buf.data() + 1); - // synchronizing and updating the calculated values of a and b from device to host + //synchronizing and updating the calculated values of a and b from device to host uint64_t ab_host[2] = {0}; raft::update_host(ab_host, arr_buf.data(), 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); - // error handling + //error handling CUDA_CHECK(cudaGetLastError()); - // denominator + //denominator uint64_t nChooseTwo = size * (size - 1) / 2; - // calculating the rand_index + //calculating the rand_index return (double)(((double)(ab_host[0] + ab_host[1])) / (double)nChooseTwo); } template -::std::ostream& operator<<(::std::ostream& os, const LinkageInputs& dims) -{ +::std::ostream& operator<<(::std::ostream& os, + const LinkageInputs& dims) { return os; } template class LinkageTest : public ::testing::TestWithParam> { protected: - void basicTest() - { + void basicTest() { raft::handle_t handle; params = ::testing::TestWithParam>::GetParam(); - rmm::device_uvector data(params.n_row * params.n_col, handle.get_stream()); + rmm::device_uvector data(params.n_row * params.n_col, + handle.get_stream()); // Allocate result labels and expected labels on device raft::allocate(labels, params.n_row); raft::allocate(labels_ref, params.n_row); - raft::copy(data.data(), params.data.data(), data.size(), handle.get_stream()); - raft::copy(labels_ref, params.expected_labels.data(), params.n_row, handle.get_stream()); + raft::copy(data.data(), params.data.data(), data.size(), + handle.get_stream()); + raft::copy(labels_ref, params.expected_labels.data(), params.n_row, + handle.get_stream()); raft::hierarchy::linkage_output out_arrs; out_arrs.labels = labels; - rmm::device_uvector out_children(params.n_row * 2, handle.get_stream()); + rmm::device_uvector out_children(params.n_row * 2, + handle.get_stream()); out_arrs.children = out_children.data(); - raft::hierarchy::single_linkage( - handle, - data.data(), - params.n_row, - params.n_col, - raft::distance::DistanceType::L2SqrtExpanded, - &out_arrs, - params.c, + raft::hierarchy::single_linkage< + IdxT, T, raft::hierarchy::LinkageDistance::KNN_GRAPH>( + handle, data.data(), params.n_row, params.n_col, + raft::distance::DistanceType::L2SqrtExpanded, &out_arrs, params.c, params.n_clusters); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); - score = compute_rand_index( - labels, labels_ref, params.n_row, handle.get_device_allocator(), handle.get_stream()); + score = + compute_rand_index(labels, labels_ref, params.n_row, + handle.get_device_allocator(), handle.get_stream()); } void SetUp() override { basicTest(); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(labels)); CUDA_CHECK(cudaFree(labels_ref)); } @@ -214,12 +212,14 @@ const std::vector> linkage_inputsf2 = { // Test n_clusters == n_points {10, 5, - {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379, - 0.4035871, 0.3282796, 0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717, - 0.50498218, 0.5113505, 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, - 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, 0.84854131, 0.28890216, - 0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554, - 0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, + {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, + 0.77782677, 0.43772379, 0.4035871, 0.3282796, 0.47544681, 0.59862974, + 0.12319357, 0.06239463, 0.28200272, 0.1345717, 0.50498218, 0.5113505, + 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, + 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, + 0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792, + 0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692, + 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, 0.76166195, 0.66613745}, {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, 10, @@ -227,7 +227,8 @@ const std::vector> linkage_inputsf2 = { // // Test outlier points {9, 2, - {-1, -50, 3, 4, 5000, 10000, 1, 3, 4, 5, 0.000005, 0.00002, 2000000, 500000, 10, 50, 30, 5}, + {-1, -50, 3, 4, 5000, 10000, 1, 3, 4, 5, 0.000005, 0.00002, 2000000, 500000, + 10, 50, 30, 5}, {6, 0, 5, 0, 0, 4, 3, 2, 1}, 7, -1}, @@ -235,12 +236,14 @@ const std::vector> linkage_inputsf2 = { // Test n_clusters == (n_points / 2) {10, 5, - {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379, - 0.4035871, 0.3282796, 0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717, - 0.50498218, 0.5113505, 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, - 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, 0.84854131, 0.28890216, - 0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554, - 0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, + {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, + 0.77782677, 0.43772379, 0.4035871, 0.3282796, 0.47544681, 0.59862974, + 0.12319357, 0.06239463, 0.28200272, 0.1345717, 0.50498218, 0.5113505, + 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, + 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, + 0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792, + 0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692, + 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, 0.76166195, 0.66613745}, {1, 0, 4, 0, 0, 3, 2, 0, 2, 1}, 5, @@ -249,173 +252,340 @@ const std::vector> linkage_inputsf2 = { // Test n_points == 100 {100, 10, - {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, - 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, - 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, - 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, - 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, - 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, - 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, - 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, - 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, - 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, - 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, - 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, - 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, - 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, - 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, - 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, - 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, - 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, - 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, - 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, - 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, - 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, - 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, - 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, - 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, - 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, - 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, - 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, - 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, - 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, - 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, - 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, - 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, - 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, - 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, - 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, - 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, - 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, - 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, - 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, - 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, - 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, - 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, - 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, - 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, - 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, - 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, - 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, - 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, - 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, - 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, - 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, - 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, - 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, - 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, - 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, - 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, - 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, - 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, - 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, - 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, - 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, - 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, - 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, - 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, - 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, - 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, - 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, - 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, - 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, - 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, - 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, - 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, - 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, - 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, - 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, - 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, - 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, - 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, - 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, - 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, - 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, - 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, - 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, - 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, - 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, - 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, - 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, - 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, - 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, - 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, - 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, - 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, - 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, - 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, - 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, - 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, - 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, - 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, - 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, - 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, - 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, - 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, - 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, - 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, - 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, - 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, - 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, - 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, - 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, - 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, - 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, - 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, - 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, - 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, - 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, - 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, - 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, - 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, - 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, - 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, - 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, - 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, - 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, - 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, - 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, - 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, - 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, - 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, - 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, - 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, - 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, - 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, - 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, - 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, - 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, - 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, - 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, - 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, - 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, - 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, - 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, - 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, - 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, - 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, - 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, - 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, - 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, - 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, - 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, - 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, - 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, - 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, - 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, - 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, - 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, - 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, - 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, - 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, - 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, - 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, - 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, - 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, - 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, - 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, - 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, - 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01 + {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, + 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, + 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, + 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, + 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, + 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, + 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, + 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, + 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, + 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, + 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, + 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, + 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, + 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, + 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, + 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, + 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, + 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, + 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, + 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, + 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, + 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, + 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, + 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, + 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, + 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, + 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, + 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, + 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, + 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, + 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, + 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, + 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, + 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, + 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, + 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, + 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, + 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, + 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, + 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, + 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, + 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, + 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, + 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, + 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, + 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, + 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, + 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, + 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, + 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, + 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, + 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, + 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, + 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, + 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, + 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, + 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, + 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, + 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, + 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, + 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, + 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, + 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, + 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, + 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, + 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, + 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, + 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, + 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, + 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, + 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, + 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, + 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, + 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, + 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, + 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, + 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, + 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, + 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, + 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, + 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, + 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, + 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, + 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, + 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, + 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, + 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, + 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, + 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, + 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, + 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, + 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, + 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, + 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, + 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, + 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, + 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, + 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, + 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, + 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, + 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, + 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, + 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, + 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, + 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, + 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, + 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, + 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, + 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, + 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, + 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, + 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, + 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, + 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, + 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, + 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, + 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, + 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, + 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, + 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, + 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, + 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, + 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, + 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, + 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, + 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, + 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, + 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, + 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, + 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, + 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, + 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, + 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, + 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, + 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, + 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, + 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, + 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, + 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, + 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, + 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, + 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, + 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, + 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, + 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, + 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, + 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, + 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, + 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, + 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, + 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, + 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, + 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, + 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, + 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, + 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, + 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, + 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, + 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, + 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, + 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, + 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, + 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, + 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, + 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, + 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, + 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, + 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, + 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, + 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, + 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, + 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, + 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, + 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, + 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, + 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, + 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, + 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, + 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, + 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, + 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, + 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, + 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, + 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, + 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, + 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, + 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, + 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, + 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, + 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, + 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, + 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, + 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, + 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, + 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, + 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, + 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, + 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, + 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, + 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, + 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, + 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, + 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, + 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, + 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, + 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, + 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, + 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, + 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, + 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, + 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, + 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, + 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, + 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, + 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, + 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, + 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, + 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, + 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, + 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, + 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, + 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, + 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, + 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, + 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, + 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, + 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, + 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, + 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, + 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, + 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, + 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, + 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, + 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, + 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, + 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, + 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, + 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, + 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, + 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, + 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, + 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, + 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, + 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, + 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, + 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, + 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, + 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, + 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, + 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, + 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, + 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, + 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, + 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, + 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, + 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, + 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, + 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, + 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, + 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, + 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, + 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, + 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, + 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, + 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, + 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, + 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, + 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, + 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, + 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, + 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, + 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, + 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, + 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, + 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, + 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, + 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, + 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, + 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, + 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, + 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, + 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, + 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, + 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, + 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, + 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, + 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, + 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, + 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, + 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, + 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, + 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, + 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, + 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, + 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, + 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, + 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, + 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, + 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, + 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, + 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, + 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, + 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, + 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, + 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, + 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, + 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, + 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, + 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, + 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, + 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, + 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, + 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, + 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, + 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, + 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, + 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, + 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, + 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, + 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, + 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, + 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, + 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, + 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, + 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, + 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, + 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, + 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, + 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, + 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, + 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, + 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, + 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, + 8.66342445e-01 }, {0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -428,5 +598,6 @@ const std::vector> linkage_inputsf2 = { typedef LinkageTest LinkageTestF_Int; TEST_P(LinkageTestF_Int, Result) { EXPECT_TRUE(score == 1.0); } -INSTANTIATE_TEST_CASE_P(LinkageTest, LinkageTestF_Int, ::testing::ValuesIn(linkage_inputsf2)); +INSTANTIATE_TEST_CASE_P(LinkageTest, LinkageTestF_Int, + ::testing::ValuesIn(linkage_inputsf2)); } // end namespace raft diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu index 4897d8194b..7adbbf8b9a 100644 --- a/cpp/test/sparse/norm.cu +++ b/cpp/test/sparse/norm.cu @@ -39,11 +39,12 @@ struct CSRRowNormalizeInputs { }; template -class CSRRowNormalizeTest : public ::testing::TestWithParam> { +class CSRRowNormalizeTest + : public ::testing::TestWithParam> { protected: - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); + void SetUp() override { + params = ::testing::TestWithParam< + CSRRowNormalizeInputs>::GetParam(); cudaStreamCreate(&stream); raft::allocate(in_vals, params.in_vals.size()); @@ -52,10 +53,9 @@ class CSRRowNormalizeTest : public ::testing::TestWithParam(ex_scan, in_vals, nnz, n_rows, result, stream); + linalg::csr_row_normalize_max<32, Type_f>(ex_scan, in_vals, nnz, n_rows, + result, stream); break; case L1: - linalg::csr_row_normalize_l1<32, Type_f>(ex_scan, in_vals, nnz, n_rows, result, stream); + linalg::csr_row_normalize_l1<32, Type_f>(ex_scan, in_vals, nnz, n_rows, + result, stream); break; } - ASSERT_TRUE(raft::devArrMatch(verify, result, nnz, raft::Compare())); + ASSERT_TRUE( + raft::devArrMatch(verify, result, nnz, raft::Compare())); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(ex_scan)); CUDA_CHECK(cudaFree(in_vals)); CUDA_CHECK(cudaFree(verify)); @@ -85,7 +87,7 @@ class CSRRowNormalizeTest : public ::testing::TestWithParam params; cudaStream_t stream; - Index_* ex_scan; + Index_ *ex_scan; Type_f *in_vals, *result, *verify; }; @@ -116,11 +118,9 @@ const std::vector> csrnormalize_inputs_d = { {0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 1, 0.0}}, }; -INSTANTIATE_TEST_CASE_P(SparseNormTest, - CSRRowNormalizeTestF, +INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestF, ::testing::ValuesIn(csrnormalize_inputs_f)); -INSTANTIATE_TEST_CASE_P(SparseNormTest, - CSRRowNormalizeTestD, +INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestD, ::testing::ValuesIn(csrnormalize_inputs_d)); } // namespace sparse diff --git a/cpp/test/sparse/reduce.cu b/cpp/test/sparse/reduce.cu index 44098214d2..50b5dc5993 100644 --- a/cpp/test/sparse/reduce.cu +++ b/cpp/test/sparse/reduce.cu @@ -42,19 +42,19 @@ struct SparseReduceInputs { }; template -class SparseReduceTest : public ::testing::TestWithParam> { +class SparseReduceTest + : public ::testing::TestWithParam> { protected: - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); + void SetUp() override { + params = ::testing::TestWithParam< + SparseReduceInputs>::GetParam(); } - void Run() - { + void Run() { raft::handle_t handle; auto d_alloc = handle.get_device_allocator(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); rmm::device_uvector in_rows(params.in_rows.size(), stream); rmm::device_uvector in_cols(params.in_cols.size(), stream); @@ -63,29 +63,30 @@ class SparseReduceTest : public ::testing::TestWithParam out_cols(params.out_cols.size(), stream); rmm::device_uvector out_vals(params.out_vals.size(), stream); - raft::update_device(in_rows.data(), params.in_rows.data(), params.in_rows.size(), stream); - raft::update_device(in_cols.data(), params.in_cols.data(), params.in_cols.size(), stream); - raft::update_device(in_vals.data(), params.in_vals.data(), params.in_vals.size(), stream); - raft::update_device(out_rows.data(), params.out_rows.data(), params.out_rows.size(), stream); - raft::update_device(out_cols.data(), params.out_cols.data(), params.out_cols.size(), stream); - raft::update_device(out_vals.data(), params.out_vals.data(), params.out_vals.size(), stream); + raft::update_device(in_rows.data(), params.in_rows.data(), + params.in_rows.size(), stream); + raft::update_device(in_cols.data(), params.in_cols.data(), + params.in_cols.size(), stream); + raft::update_device(in_vals.data(), params.in_vals.data(), + params.in_vals.size(), stream); + raft::update_device(out_rows.data(), params.out_rows.data(), + params.out_rows.size(), stream); + raft::update_device(out_cols.data(), params.out_cols.data(), + params.out_cols.size(), stream); + raft::update_device(out_vals.data(), params.out_vals.data(), + params.out_vals.size(), stream); raft::sparse::COO out(d_alloc, stream); - raft::sparse::op::max_duplicates(handle, - out, - in_rows.data(), - in_cols.data(), - in_vals.data(), - params.in_rows.size(), - params.m, - params.n); + raft::sparse::op::max_duplicates(handle, out, in_rows.data(), + in_cols.data(), in_vals.data(), + params.in_rows.size(), params.m, params.n); ASSERT_TRUE(raft::devArrMatch( out_rows.data(), out.rows(), out.nnz, raft::Compare())); ASSERT_TRUE(raft::devArrMatch( out_cols.data(), out.cols(), out.nnz, raft::Compare())); - ASSERT_TRUE( - raft::devArrMatch(out_vals.data(), out.vals(), out.nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out_vals.data(), out.vals(), out.nnz, + raft::Compare())); } void TearDown() override {} @@ -114,8 +115,7 @@ const std::vector> max_reduce_inputs_f = { 4}, }; -INSTANTIATE_TEST_CASE_P(SparseReduceTest, - SparseReduceTestF, +INSTANTIATE_TEST_CASE_P(SparseReduceTest, SparseReduceTestF, ::testing::ValuesIn(max_reduce_inputs_f)); } // namespace sparse diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu index feefa7baa3..b64fa25883 100644 --- a/cpp/test/sparse/row_op.cu +++ b/cpp/test/sparse/row_op.cu @@ -38,47 +38,43 @@ struct CSRRowOpInputs { /** Wrapper to call csr_row_op because the enclosing function of a __device__ * lambda cannot have private ot protected access within the class. */ template -void csr_row_op_wrapper( - const Index_* row_ind, Index_ n_rows, Index_ nnz, Type_f* result, cudaStream_t stream) -{ +void csr_row_op_wrapper(const Index_ *row_ind, Index_ n_rows, Index_ nnz, + Type_f *result, cudaStream_t stream) { op::csr_row_op( - row_ind, - n_rows, - nnz, + row_ind, n_rows, nnz, [result] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) { - for (Index_ i = start_idx; i < stop_idx; i++) - result[i] = row; + for (Index_ i = start_idx; i < stop_idx; i++) result[i] = row; }, stream); } template -class CSRRowOpTest : public ::testing::TestWithParam> { +class CSRRowOpTest + : public ::testing::TestWithParam> { protected: - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); + void SetUp() override { + params = + ::testing::TestWithParam>::GetParam(); cudaStreamCreate(&stream); n_rows = params.ex_scan.size(); - nnz = params.verify.size(); + nnz = params.verify.size(); raft::allocate(verify, nnz); raft::allocate(ex_scan, n_rows); raft::allocate(result, nnz, true); } - void Run() - { + void Run() { raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream); raft::update_device(verify, params.verify.data(), nnz, stream); csr_row_op_wrapper(ex_scan, n_rows, nnz, result, stream); - ASSERT_TRUE(raft::devArrMatch(verify, result, nnz, raft::Compare())); + ASSERT_TRUE( + raft::devArrMatch(verify, result, nnz, raft::Compare())); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(ex_scan)); CUDA_CHECK(cudaFree(verify)); CUDA_CHECK(cudaFree(result)); @@ -89,7 +85,7 @@ class CSRRowOpTest : public ::testing::TestWithParam params; cudaStream_t stream; Index_ n_rows, nnz; - Index_* ex_scan; + Index_ *ex_scan; Type_f *result, *verify; }; @@ -106,8 +102,10 @@ const std::vector> csrrowop_inputs_d = { {{0, 4, 8, 9}, {0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0}}, }; -INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestF, ::testing::ValuesIn(csrrowop_inputs_f)); -INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestD, ::testing::ValuesIn(csrrowop_inputs_d)); +INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestF, + ::testing::ValuesIn(csrrowop_inputs_f)); +INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestD, + ::testing::ValuesIn(csrrowop_inputs_d)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/selection.cu b/cpp/test/sparse/selection.cu index 5d3b2a8317..46f2f6a844 100644 --- a/cpp/test/sparse/selection.cu +++ b/cpp/test/sparse/selection.cu @@ -45,9 +45,8 @@ struct SparseSelectionInputs { }; template -::std::ostream& operator<<(::std::ostream& os, - const SparseSelectionInputs& dims) -{ +::std::ostream &operator<<( + ::std::ostream &os, const SparseSelectionInputs &dims) { return os; } @@ -55,8 +54,7 @@ template class SparseSelectionTest : public ::testing::TestWithParam> { protected: - void make_data() - { + void make_data() { std::vector dists_h = params.dists_h; allocate(dists, n_rows * n_cols); @@ -65,39 +63,42 @@ class SparseSelectionTest allocate(inds, n_rows * n_cols); iota_fill(inds, n_rows, n_cols, stream); - std::vector out_dists_ref_h = params.out_dists_ref_h; + std::vector out_dists_ref_h = params.out_dists_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; allocate(out_indices_ref, out_indices_ref_h.size()); allocate(out_dists_ref, out_dists_ref_h.size()); - update_device(out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), stream); - update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), stream); + update_device(out_indices_ref, out_indices_ref_h.data(), + out_indices_ref_h.size(), stream); + update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), + stream); allocate(out_dists, n_rows * k); allocate(out_indices, n_rows * k); } - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); - std::shared_ptr alloc(new raft::mr::device::default_allocator); + void SetUp() override { + params = ::testing::TestWithParam< + SparseSelectionInputs>::GetParam(); + std::shared_ptr alloc( + new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); n_rows = params.n_rows; n_cols = params.n_cols; - k = params.k; + k = params.k; make_data(); - raft::sparse::selection::select_k( - dists, inds, n_rows, n_cols, out_dists, out_indices, params.select_min, k, stream); + raft::sparse::selection::select_k(dists, inds, n_rows, n_cols, out_dists, + out_indices, params.select_min, k, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(dists)); @@ -110,10 +111,11 @@ class SparseSelectionTest CUDA_CHECK(cudaStreamDestroy(stream)); } - void compare() - { - ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k, Compare())); - ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k, Compare())); + void compare() { + ASSERT_TRUE( + devArrMatch(out_dists_ref, out_dists, n_rows * k, Compare())); + ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k, + Compare())); } protected: @@ -122,15 +124,15 @@ class SparseSelectionTest int n_rows, n_cols, k; // input data - value_t* dists; - value_idx* inds; + value_t *dists; + value_idx *inds; // output data - value_idx* out_indices; - value_t* out_dists; + value_idx *out_indices; + value_t *out_dists; - value_idx* out_indices_ref; - value_t* out_dists_ref; + value_idx *out_indices_ref; + value_t *out_dists_ref; SparseSelectionInputs params; }; @@ -147,8 +149,7 @@ const std::vector> inputs_i32_f = { true}}; typedef SparseSelectionTest SparseSelectionTestF; TEST_P(SparseSelectionTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(SparseSelectionTest, - SparseSelectionTestF, +INSTANTIATE_TEST_CASE_P(SparseSelectionTest, SparseSelectionTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace selection diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu index e154d19d34..b9a8b849eb 100644 --- a/cpp/test/sparse/sort.cu +++ b/cpp/test/sparse/sort.cu @@ -47,27 +47,27 @@ class SparseSortTest : public ::testing::TestWithParam> { const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseSortTest COOSort; -TEST_P(COOSort, Result) -{ +TEST_P(COOSort, Result) { int *in_rows, *in_cols, *verify; - float* in_vals; + float *in_vals; params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - std::shared_ptr alloc(new raft::mr::device::default_allocator); + std::shared_ptr alloc( + new raft::mr::device::default_allocator); raft::allocate(in_vals, params.nnz); r.uniform(in_vals, params.nnz, float(-1.0), float(1.0), stream); - int* in_rows_h = (int*)malloc(params.nnz * sizeof(int)); - int* in_cols_h = (int*)malloc(params.nnz * sizeof(int)); - int* verify_h = (int*)malloc(params.nnz * sizeof(int)); + int *in_rows_h = (int *)malloc(params.nnz * sizeof(int)); + int *in_cols_h = (int *)malloc(params.nnz * sizeof(int)); + int *verify_h = (int *)malloc(params.nnz * sizeof(int)); for (int i = 0; i < params.nnz; i++) { in_rows_h[i] = params.nnz - i - 1; - verify_h[i] = i; + verify_h[i] = i; in_cols_h[i] = i; } @@ -80,9 +80,11 @@ TEST_P(COOSort, Result) raft::update_device(in_cols, in_cols_h, params.nnz, stream); raft::update_device(verify, verify_h, params.nnz, stream); - op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, alloc, stream); + op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, alloc, + stream); - ASSERT_TRUE(raft::devArrMatch(verify, in_rows, params.nnz, raft::Compare())); + ASSERT_TRUE( + raft::devArrMatch(verify, in_rows, params.nnz, raft::Compare())); delete[] in_rows_h; delete[] in_cols_h; diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu index 6a66daa769..d104028d2b 100644 --- a/cpp/test/sparse/symmetrize.cu +++ b/cpp/test/sparse/symmetrize.cu @@ -29,9 +29,8 @@ namespace raft { namespace sparse { template -__global__ void assert_symmetry( - value_idx* rows, value_idx* cols, value_t* vals, value_idx nnz, value_idx* sum) -{ +__global__ void assert_symmetry(value_idx *rows, value_idx *cols, value_t *vals, + value_idx nnz, value_idx *sum) { int tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; @@ -50,21 +49,19 @@ struct SparseSymmetrizeInputs { }; template -::std::ostream& operator<<(::std::ostream& os, - const SparseSymmetrizeInputs& dims) -{ +::std::ostream &operator<<( + ::std::ostream &os, const SparseSymmetrizeInputs &dims) { return os; } template -class SparseSymmetrizeTest - : public ::testing::TestWithParam> { +class SparseSymmetrizeTest : public ::testing::TestWithParam< + SparseSymmetrizeInputs> { protected: - void make_data() - { - std::vector indptr_h = params.indptr_h; + void make_data() { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; allocate(indptr, indptr_h.size()); allocate(indices, indices_h.size()); @@ -75,19 +72,19 @@ class SparseSymmetrizeTest update_device(data, data_h.data(), data_h.size(), stream); } - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); + void SetUp() override { + params = ::testing::TestWithParam< + SparseSymmetrizeInputs>::GetParam(); raft::handle_t handle; auto alloc = handle.get_device_allocator(); - stream = handle.get_stream(); + stream = handle.get_stream(); make_data(); - value_idx m = params.indptr_h.size() - 1; - value_idx n = params.n_cols; + value_idx m = params.indptr_h.size() - 1; + value_idx n = params.n_cols; value_idx nnz = params.indices_h.size(); raft::mr::device::buffer coo_rows(alloc, stream, nnz); @@ -96,8 +93,8 @@ class SparseSymmetrizeTest raft::sparse::COO out(alloc, stream); - raft::sparse::linalg::symmetrize( - handle, coo_rows.data(), indices, data, m, n, coo_rows.size(), out); + raft::sparse::linalg::symmetrize(handle, coo_rows.data(), indices, data, m, + n, coo_rows.size(), out); raft::mr::device::buffer sum(alloc, stream, 1); @@ -110,8 +107,7 @@ class SparseSymmetrizeTest CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaStreamSynchronize(stream)); CUDA_CHECK(cudaFree(indptr)); CUDA_CHECK(cudaFree(indices)); @@ -123,7 +119,7 @@ class SparseSymmetrizeTest // input data value_idx *indptr, *indices; - value_t* data; + value_t *data; value_idx sum_h; @@ -137,7 +133,8 @@ struct COOSymmetrizeInputs { }; template -class COOSymmetrizeTest : public ::testing::TestWithParam> { +class COOSymmetrizeTest + : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -147,8 +144,7 @@ class COOSymmetrizeTest : public ::testing::TestWithParam const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef COOSymmetrizeTest COOSymmetrize; -TEST_P(COOSymmetrize, Result) -{ +TEST_P(COOSymmetrize, Result) { cudaStream_t stream; cudaStreamCreate(&stream); @@ -157,14 +153,16 @@ TEST_P(COOSymmetrize, Result) int nnz = 8; - int* in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; - int* in_cols_h = new int[nnz]{1, 3, 2, 3, 0, 1, 0, 2}; - float* in_vals_h = new float[nnz]{0.5, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5}; + int *in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; + int *in_cols_h = new int[nnz]{1, 3, 2, 3, 0, 1, 0, 2}; + float *in_vals_h = new float[nnz]{0.5, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5}; - int* exp_rows_h = new int[nnz * 2]{1, 0, 0, 0, 1, 3, 1, 0, 0, 2, 2, 0, 3, 2, 3, 0}; - int* exp_cols_h = new int[nnz * 2]{0, 1, 3, 0, 2, 1, 3, 0, 2, 0, 1, 0, 0, 3, 2, 0}; - float* exp_vals_h = - new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0, 0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0}; + int *exp_rows_h = + new int[nnz * 2]{1, 0, 0, 0, 1, 3, 1, 0, 0, 2, 2, 0, 3, 2, 3, 0}; + int *exp_cols_h = + new int[nnz * 2]{0, 1, 3, 0, 2, 1, 3, 0, 2, 0, 1, 0, 0, 3, 2, 0}; + float *exp_vals_h = new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0, + 0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0}; COO in(alloc, stream, nnz, 4, 4); raft::update_device(in.rows(), *&in_rows_h, nnz, stream); @@ -174,19 +172,22 @@ TEST_P(COOSymmetrize, Result) COO out(alloc, stream); linalg::coo_symmetrize<32, float>( - &in, - &out, - [] __device__(int row, int col, float val, float trans) { return val + trans; }, - alloc, - stream); + &in, &out, + [] __device__(int row, int col, float val, float trans) { + return val + trans; + }, + alloc, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); std::cout << out << std::endl; ASSERT_TRUE(out.nnz == nnz * 2); - ASSERT_TRUE(raft::devArrMatch(out.rows(), exp_rows_h, out.nnz, raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out.cols(), exp_cols_h, out.nnz, raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out.vals(), exp_vals_h, out.nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.rows(), exp_rows_h, out.nnz, + raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.cols(), exp_cols_h, out.nnz, + raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.vals(), exp_vals_h, out.nnz, + raft::Compare())); cudaStreamDestroy(stream); @@ -199,7 +200,8 @@ TEST_P(COOSymmetrize, Result) delete[] exp_vals_h; } -INSTANTIATE_TEST_CASE_P(COOSymmetrizeTest, COOSymmetrize, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(COOSymmetrizeTest, COOSymmetrize, + ::testing::ValuesIn(inputsf)); const std::vector> symm_inputs_fint = { // Test n_clusters == n_points @@ -219,8 +221,7 @@ const std::vector> symm_inputs_fint = { typedef SparseSymmetrizeTest SparseSymmetrizeTestF_int; TEST_P(SparseSymmetrizeTestF_int, Result) { ASSERT_TRUE(sum_h == 0); } -INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest, - SparseSymmetrizeTestF_int, +INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest, SparseSymmetrizeTestF_int, ::testing::ValuesIn(symm_inputs_fint)); } // namespace sparse diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu index 8d35960d6a..def1f1685b 100644 --- a/cpp/test/spatial/haversine.cu +++ b/cpp/test/spatial/haversine.cu @@ -29,8 +29,7 @@ namespace knn { template class HaversineKNNTest : public ::testing::Test { protected: - void basicTest() - { + void basicTest() { auto alloc = std::make_shared(); // Allocate input @@ -45,37 +44,31 @@ class HaversineKNNTest : public ::testing::Test { raft::allocate(d_pred_D, n * n); // make testdata on host - std::vector h_train_inputs = {0.71113885, - -1.29215058, - 0.59613176, - -2.08048115, - 0.74932804, - -1.33634042, - 0.51486728, - -1.65962873, - 0.53154002, - -1.47049808, - 0.72891737, - -1.54095137}; + std::vector h_train_inputs = { + 0.71113885, -1.29215058, 0.59613176, -2.08048115, + 0.74932804, -1.33634042, 0.51486728, -1.65962873, + 0.53154002, -1.47049808, 0.72891737, -1.54095137}; h_train_inputs.resize(n); raft::update_device(d_train_inputs, h_train_inputs.data(), n * d, 0); - std::vector h_res_D = {0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595, - 0., 0.36575755, 0.44288665, 0.5170737, 0.59501296, 0.62925595, - 0., 0.05041587, 0.152463, 0.2426416, 0.34925285, 0.59501296, - 0., 0.16461092, 0.2345792, 0.34925285, 0.35749438, 0.36575755, - 0., 0.16461092, 0.20535265, 0.23048252, 0.2426416, 0.5170737, - 0., 0.152463, 0.18767063, 0.20535265, 0.2345792, 0.44288665}; + std::vector h_res_D = { + 0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595, + 0., 0.36575755, 0.44288665, 0.5170737, 0.59501296, 0.62925595, + 0., 0.05041587, 0.152463, 0.2426416, 0.34925285, 0.59501296, + 0., 0.16461092, 0.2345792, 0.34925285, 0.35749438, 0.36575755, + 0., 0.16461092, 0.20535265, 0.23048252, 0.2426416, 0.5170737, + 0., 0.152463, 0.18767063, 0.20535265, 0.2345792, 0.44288665}; h_res_D.resize(n * n); raft::update_device(d_ref_D, h_res_D.data(), n * n, 0); - std::vector h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0, 2, 0, 5, 4, 3, 1, - 3, 4, 5, 2, 0, 1, 4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1}; + std::vector h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0, + 2, 0, 5, 4, 3, 1, 3, 4, 5, 2, 0, 1, + 4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1}; h_res_I.resize(n * n); raft::update_device(d_ref_I, h_res_I.data(), n * n, 0); - std::vector input_vec = {d_train_inputs}; + std::vector input_vec = {d_train_inputs}; std::vector sizes_vec = {n}; cudaStream_t stream; @@ -89,8 +82,7 @@ class HaversineKNNTest : public ::testing::Test { void SetUp() override { basicTest(); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(d_train_inputs)); CUDA_CHECK(cudaFree(d_pred_I)); CUDA_CHECK(cudaFree(d_pred_D)); @@ -99,26 +91,27 @@ class HaversineKNNTest : public ::testing::Test { } protected: - value_t* d_train_inputs; + value_t *d_train_inputs; int n = 6; int d = 2; int k = 6; - value_idx* d_pred_I; - value_t* d_pred_D; + value_idx *d_pred_I; + value_t *d_pred_D; - value_idx* d_ref_I; - value_t* d_ref_D; + value_idx *d_ref_I; + value_t *d_ref_D; }; typedef HaversineKNNTest HaversineKNNTestF; -TEST_F(HaversineKNNTestF, Fit) -{ - ASSERT_TRUE(raft::devArrMatch(d_ref_D, d_pred_D, n * n, raft::CompareApprox(1e-3))); - ASSERT_TRUE(raft::devArrMatch(d_ref_I, d_pred_I, n * n, raft::Compare())); +TEST_F(HaversineKNNTestF, Fit) { + ASSERT_TRUE(raft::devArrMatch(d_ref_D, d_pred_D, n * n, + raft::CompareApprox(1e-3))); + ASSERT_TRUE( + raft::devArrMatch(d_ref_I, d_pred_I, n * n, raft::Compare())); } } // namespace knn diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index d4e35c9d54..2b1ef89f7a 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -31,18 +31,18 @@ struct KNNInputs { std::vector labels; }; -__global__ void build_actual_output( - int* output, int n_rows, int k, const int* idx_labels, const int64_t* indices) -{ +__global__ void build_actual_output(int *output, int n_rows, int k, + const int *idx_labels, + const int64_t *indices) { int element = threadIdx.x + blockDim.x * blockIdx.x; if (element >= n_rows * k) return; - int ind = (int)indices[element]; + int ind = (int)indices[element]; output[element] = idx_labels[ind]; } -__global__ void build_expected_output(int* output, int n_rows, int k, const int* labels) -{ +__global__ void build_expected_output(int *output, int n_rows, int k, + const int *labels) { int row = threadIdx.x + blockDim.x * blockIdx.x; if (row >= n_rows) return; @@ -55,33 +55,25 @@ __global__ void build_expected_output(int* output, int n_rows, int k, const int* template class KNNTest : public ::testing::TestWithParam { protected: - void testBruteForce() - { - raft::print_device_vector("Input array: ", input_, rows_ * cols_, std::cout); + void testBruteForce() { + raft::print_device_vector("Input array: ", input_, rows_ * cols_, + std::cout); std::cout << "K: " << k_ << "\n"; - raft::print_device_vector("Labels array: ", search_labels_, rows_, std::cout); + raft::print_device_vector("Labels array: ", search_labels_, rows_, + std::cout); auto stream = handle_.get_stream(); raft::allocate(actual_labels_, rows_ * k_, true); raft::allocate(expected_labels_, rows_ * k_, true); - std::vector input_vec; + std::vector input_vec; std::vector sizes_vec; input_vec.push_back(input_); sizes_vec.push_back(rows_); - brute_force_knn(handle_, - input_vec, - sizes_vec, - cols_, - search_data_, - rows_, - indices_, - distances_, - k_, - true, - true); + brute_force_knn(handle_, input_vec, sizes_vec, cols_, search_data_, rows_, + indices_, distances_, k_, true, true); build_actual_output<<>>( actual_labels_, rows_, k_, search_labels_, indices_); @@ -89,20 +81,24 @@ class KNNTest : public ::testing::TestWithParam { build_expected_output<<>>( expected_labels_, rows_, k_, search_labels_); - raft::print_device_vector("Output indices: ", indices_, rows_ * k_, std::cout); - raft::print_device_vector("Output distances: ", distances_, rows_ * k_, std::cout); - raft::print_device_vector("Output labels: ", actual_labels_, rows_ * k_, std::cout); - raft::print_device_vector("Expected labels: ", expected_labels_, rows_ * k_, std::cout); - - ASSERT_TRUE(devArrMatch(expected_labels_, actual_labels_, rows_ * k_, raft::Compare())); + raft::print_device_vector("Output indices: ", indices_, rows_ * k_, + std::cout); + raft::print_device_vector("Output distances: ", distances_, rows_ * k_, + std::cout); + raft::print_device_vector("Output labels: ", actual_labels_, rows_ * k_, + std::cout); + raft::print_device_vector("Expected labels: ", expected_labels_, rows_ * k_, + std::cout); + + ASSERT_TRUE(devArrMatch(expected_labels_, actual_labels_, rows_ * k_, + raft::Compare())); } - void SetUp() override - { + void SetUp() override { params_ = ::testing::TestWithParam::GetParam(); - rows_ = params_.input.size(); - cols_ = params_.input[0].size(); - k_ = params_.k; + rows_ = params_.input.size(); + cols_ = params_.input[0].size(); + k_ = params_.k; std::vector row_major_input; for (int i = 0; i < params_.input.size(); ++i) { @@ -111,12 +107,14 @@ class KNNTest : public ::testing::TestWithParam { } } rmm::device_buffer input_d = rmm::device_buffer( - row_major_input.data(), row_major_input.size() * sizeof(float), handle_.get_stream()); - float* input_ptr = static_cast(input_d.data()); + row_major_input.data(), row_major_input.size() * sizeof(float), + handle_.get_stream()); + float *input_ptr = static_cast(input_d.data()); rmm::device_buffer labels_d = rmm::device_buffer( - params_.labels.data(), params_.labels.size() * sizeof(int), handle_.get_stream()); - int* labels_ptr = static_cast(labels_d.data()); + params_.labels.data(), params_.labels.size() * sizeof(int), + handle_.get_stream()); + int *labels_ptr = static_cast(labels_d.data()); raft::allocate(input_, rows_ * cols_, true); raft::allocate(search_data_, rows_ * cols_, true); @@ -129,8 +127,7 @@ class KNNTest : public ::testing::TestWithParam { raft::copy(search_labels_, labels_ptr, rows_, handle_.get_stream()); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(search_data_)); CUDA_CHECK(cudaFree(indices_)); CUDA_CHECK(cudaFree(distances_)); @@ -142,15 +139,15 @@ class KNNTest : public ::testing::TestWithParam { KNNInputs params_; int rows_; int cols_; - float* input_; - float* search_data_; - int64_t* indices_; - float* distances_; + float *input_; + float *search_data_; + int64_t *indices_; + float *distances_; int k_; - int* search_labels_; - int* actual_labels_; - int* expected_labels_; + int *search_labels_; + int *actual_labels_; + int *expected_labels_; }; const std::vector inputs = { diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu index 2d7d713717..e5c2d52764 100644 --- a/cpp/test/spectral_matrix.cu +++ b/cpp/test/spectral_matrix.cu @@ -32,8 +32,7 @@ struct csr_view_t { index_type number_of_edges; }; } // namespace -TEST(Raft, SpectralMatrices) -{ +TEST(Raft, SpectralMatrices) { using namespace matrix; using index_type = int; using value_type = double; @@ -50,18 +49,19 @@ TEST(Raft, SpectralMatrices) index_type* ro{nullptr}; index_type* ci{nullptr}; value_type* vs{nullptr}; - index_type nnz = 0; + index_type nnz = 0; index_type nrows = 0; sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; sparse_matrix_t sm2{h, csr_v}; ASSERT_EQ(nullptr, sm1.row_offsets_); ASSERT_EQ(nullptr, sm2.row_offsets_); - auto stream = h.get_stream(); + auto stream = h.get_stream(); auto t_exe_pol = thrust::cuda::par.on(stream); auto cnstr_lm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { - laplacian_matrix_t lm1{h, t_exe_pol, ro, ci, vs, nrows, nnz}; + laplacian_matrix_t lm1{h, t_exe_pol, ro, ci, + vs, nrows, nnz}; }; EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr args @@ -71,7 +71,8 @@ TEST(Raft, SpectralMatrices) EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args auto cnstr_mm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { - modularity_matrix_t mm1{h, t_exe_pol, ro, ci, vs, nrows, nnz}; + modularity_matrix_t mm1{h, t_exe_pol, ro, ci, + vs, nrows, nnz}; }; EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu index 8eb2f91952..4a3b0ed196 100644 --- a/cpp/test/stats/mean.cu +++ b/cpp/test/stats/mean.cu @@ -35,16 +35,14 @@ struct MeanInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const MeanInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, const MeanInputs &dims) { return os; } template class MeanTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); @@ -61,15 +59,13 @@ class MeanTest : public ::testing::TestWithParam> { meanSGtest(data, stream); } - void meanSGtest(T* data, cudaStream_t stream) - { + void meanSGtest(T *data, cudaStream_t stream) { int rows = params.rows, cols = params.cols; mean(mean_act, data, cols, rows, params.sample, params.rowMajor, stream); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(mean_act)); } @@ -82,52 +78,52 @@ class MeanTest : public ::testing::TestWithParam> { // Note: For 1024 samples, 256 experiments, a mean of 1.0 with stddev=1.0, the // measured mean (of a normal distribution) will fall outside of an epsilon of // 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times) -const std::vector> inputsf = {{0.15f, 1.f, 1024, 32, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, - {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, - {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, - {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 256, false, true, 1234ULL}}; - -const std::vector> inputsd = {{0.15, 1.0, 1024, 32, true, false, 1234ULL}, - {0.15, 1.0, 1024, 64, true, false, 1234ULL}, - {0.15, 1.0, 1024, 128, true, false, 1234ULL}, - {0.15, 1.0, 1024, 256, true, false, 1234ULL}, - {0.15, -1.0, 1024, 32, false, false, 1234ULL}, - {0.15, -1.0, 1024, 64, false, false, 1234ULL}, - {0.15, -1.0, 1024, 128, false, false, 1234ULL}, - {0.15, -1.0, 1024, 256, false, false, 1234ULL}, - {0.15, 1.0, 1024, 32, true, true, 1234ULL}, - {0.15, 1.0, 1024, 64, true, true, 1234ULL}, - {0.15, 1.0, 1024, 128, true, true, 1234ULL}, - {0.15, 1.0, 1024, 256, true, true, 1234ULL}, - {0.15, -1.0, 1024, 32, false, true, 1234ULL}, - {0.15, -1.0, 1024, 64, false, true, 1234ULL}, - {0.15, -1.0, 1024, 128, false, true, 1234ULL}, - {0.15, -1.0, 1024, 256, false, true, 1234ULL}}; +const std::vector> inputsf = { + {0.15f, 1.f, 1024, 32, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, + {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, true, 1234ULL}}; + +const std::vector> inputsd = { + {0.15, 1.0, 1024, 32, true, false, 1234ULL}, + {0.15, 1.0, 1024, 64, true, false, 1234ULL}, + {0.15, 1.0, 1024, 128, true, false, 1234ULL}, + {0.15, 1.0, 1024, 256, true, false, 1234ULL}, + {0.15, -1.0, 1024, 32, false, false, 1234ULL}, + {0.15, -1.0, 1024, 64, false, false, 1234ULL}, + {0.15, -1.0, 1024, 128, false, false, 1234ULL}, + {0.15, -1.0, 1024, 256, false, false, 1234ULL}, + {0.15, 1.0, 1024, 32, true, true, 1234ULL}, + {0.15, 1.0, 1024, 64, true, true, 1234ULL}, + {0.15, 1.0, 1024, 128, true, true, 1234ULL}, + {0.15, 1.0, 1024, 256, true, true, 1234ULL}, + {0.15, -1.0, 1024, 32, false, true, 1234ULL}, + {0.15, -1.0, 1024, 64, false, true, 1234ULL}, + {0.15, -1.0, 1024, 128, false, true, 1234ULL}, + {0.15, -1.0, 1024, 256, false, true, 1234ULL}}; typedef MeanTest MeanTestF; -TEST_P(MeanTestF, Result) -{ - ASSERT_TRUE( - devArrMatch(params.mean, mean_act, params.cols, CompareApprox(params.tolerance))); +TEST_P(MeanTestF, Result) { + ASSERT_TRUE(devArrMatch(params.mean, mean_act, params.cols, + CompareApprox(params.tolerance))); } typedef MeanTest MeanTestD; -TEST_P(MeanTestD, Result) -{ - ASSERT_TRUE( - devArrMatch(params.mean, mean_act, params.cols, CompareApprox(params.tolerance))); +TEST_P(MeanTestD, Result) { + ASSERT_TRUE(devArrMatch(params.mean, mean_act, params.cols, + CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(MeanTests, MeanTestF, ::testing::ValuesIn(inputsf)); diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu index 67df0def05..8b0d607561 100644 --- a/cpp/test/stats/mean_center.cu +++ b/cpp/test/stats/mean_center.cu @@ -34,16 +34,16 @@ struct MeanCenterInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const MeanCenterInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, + const MeanCenterInputs &dims) { return os; } template -class MeanCenterTest : public ::testing::TestWithParam> { +class MeanCenterTest + : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); @@ -51,7 +51,7 @@ class MeanCenterTest : public ::testing::TestWithParam> inputsf_i32 = { {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL}, {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestF_i32; -TEST_P(MeanCenterTestF_i32, Result) -{ - ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestF_i32, Result) { + ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, + ::testing::ValuesIn(inputsf_i32)); const std::vector> inputsf_i64 = { {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL}, @@ -137,11 +139,12 @@ const std::vector> inputsf_i64 = { {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL}, {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestF_i64; -TEST_P(MeanCenterTestF_i64, Result) -{ - ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestF_i64, Result) { + ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, + ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsd_i32 = { {0.05, 1.0, 1024, 32, true, false, true, 1234ULL}, @@ -169,12 +172,12 @@ const std::vector> inputsd_i32 = { {0.05, -1.0, 1024, 64, false, true, false, 1234ULL}, {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestD_i32; -TEST_P(MeanCenterTestD_i32, Result) -{ - ASSERT_TRUE( - devArrMatch(out, out_ref, params.cols, raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestD_i32, Result) { + ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, + ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.05, 1.0, 1024, 32, true, false, true, 1234ULL}, @@ -202,12 +205,12 @@ const std::vector> inputsd_i64 = { {0.05, -1.0, 1024, 64, false, true, false, 1234ULL}, {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestD_i64; -TEST_P(MeanCenterTestD_i64, Result) -{ - ASSERT_TRUE( - devArrMatch(out, out_ref, params.cols, raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestD_i64, Result) { + ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, + raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64, ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64, + ::testing::ValuesIn(inputsd_i64)); } // end namespace stats } // end namespace raft diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index 8b7f75171b..ff2698788f 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -34,16 +34,14 @@ struct StdDevInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const StdDevInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, const StdDevInputs &dims) { return os; } template class StdDevTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { + void SetUp() override { params = ::testing::TestWithParam>::GetParam(); random::Rng r(params.seed); int rows = params.rows, cols = params.cols; @@ -60,21 +58,21 @@ class StdDevTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void stdVarSGtest(T* data, cudaStream_t stream) - { + void stdVarSGtest(T *data, cudaStream_t stream) { int rows = params.rows, cols = params.cols; mean(mean_act, data, cols, rows, params.sample, params.rowMajor, stream); - stddev(stddev_act, data, mean_act, cols, rows, params.sample, params.rowMajor, stream); + stddev(stddev_act, data, mean_act, cols, rows, params.sample, + params.rowMajor, stream); - vars(vars_act, data, mean_act, cols, rows, params.sample, params.rowMajor, stream); + vars(vars_act, data, mean_act, cols, rows, params.sample, params.rowMajor, + stream); raft::matrix::seqRoot(vars_act, T(1), cols, stream); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(mean_act)); CUDA_CHECK(cudaFree(stddev_act)); @@ -123,28 +121,28 @@ const std::vector> inputsd = { {0.1, -1.0, 2.0, 1024, 256, false, true, 1234ULL}}; typedef StdDevTest StdDevTestF; -TEST_P(StdDevTestF, Result) -{ - ASSERT_TRUE( - devArrMatch(params.stddev, stddev_act, params.cols, CompareApprox(params.tolerance))); +TEST_P(StdDevTestF, Result) { + ASSERT_TRUE(devArrMatch(params.stddev, stddev_act, params.cols, + CompareApprox(params.tolerance))); - ASSERT_TRUE( - devArrMatch(stddev_act, vars_act, params.cols, CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(stddev_act, vars_act, params.cols, + CompareApprox(params.tolerance))); } typedef StdDevTest StdDevTestD; -TEST_P(StdDevTestD, Result) -{ - ASSERT_TRUE( - devArrMatch(params.stddev, stddev_act, params.cols, CompareApprox(params.tolerance))); +TEST_P(StdDevTestD, Result) { + ASSERT_TRUE(devArrMatch(params.stddev, stddev_act, params.cols, + CompareApprox(params.tolerance))); - ASSERT_TRUE( - devArrMatch(stddev_act, vars_act, params.cols, CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch(stddev_act, vars_act, params.cols, + CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestF, + ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestD, + ::testing::ValuesIn(inputsd)); } // end namespace stats } // end namespace raft diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index 89e81708cc..c3140d4588 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -32,17 +32,15 @@ struct SumInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const SumInputs& dims) -{ +::std::ostream &operator<<(::std::ostream &os, const SumInputs &dims) { return os; } template class SumTest : public ::testing::TestWithParam> { protected: - void SetUp() override - { - params = ::testing::TestWithParam>::GetParam(); + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); int rows = params.rows, cols = params.cols; int len = rows * cols; cudaStream_t stream; @@ -61,8 +59,7 @@ class SumTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override - { + void TearDown() override { CUDA_CHECK(cudaFree(data)); CUDA_CHECK(cudaFree(sum_act)); } @@ -79,17 +76,15 @@ const std::vector> inputsd = {{0.05, 1024, 32, 1234ULL}, {0.05, 1024, 256, 1234ULL}}; typedef SumTest SumTestF; -TEST_P(SumTestF, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - float(params.rows), sum_act, params.cols, raft::CompareApprox(params.tolerance))); +TEST_P(SumTestF, Result) { + ASSERT_TRUE(raft::devArrMatch(float(params.rows), sum_act, params.cols, + raft::CompareApprox(params.tolerance))); } typedef SumTest SumTestD; -TEST_P(SumTestD, Result) -{ - ASSERT_TRUE(raft::devArrMatch( - double(params.rows), sum_act, params.cols, raft::CompareApprox(params.tolerance))); +TEST_P(SumTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(double(params.rows), sum_act, params.cols, + raft::CompareApprox(params.tolerance))); } INSTANTIATE_TEST_CASE_P(SumTests, SumTestF, ::testing::ValuesIn(inputsf)); diff --git a/cpp/test/test_utils.h b/cpp/test/test_utils.h index ca09d9c855..b8e8fe3fa0 100644 --- a/cpp/test/test_utils.h +++ b/cpp/test/test_utils.h @@ -25,16 +25,15 @@ namespace raft { template struct Compare { - bool operator()(const T& a, const T& b) const { return a == b; } + bool operator()(const T &a, const T &b) const { return a == b; } }; template struct CompareApprox { CompareApprox(T eps_) : eps(eps_) {} - bool operator()(const T& a, const T& b) const - { - T diff = abs(a - b); - T m = std::max(abs(a), abs(b)); + bool operator()(const T &a, const T &b) const { + T diff = abs(a - b); + T m = std::max(abs(a), abs(b)); T ratio = diff >= eps ? diff / m : diff; return (ratio <= eps); @@ -47,10 +46,9 @@ struct CompareApprox { template struct CompareApproxAbs { CompareApproxAbs(T eps_) : eps(eps_) {} - bool operator()(const T& a, const T& b) const - { - T diff = abs(abs(a) - abs(b)); - T m = std::max(abs(a), abs(b)); + bool operator()(const T &a, const T &b) const { + T diff = abs(abs(a) - abs(b)); + T m = std::max(abs(a), abs(b)); T ratio = diff >= eps ? diff / m : diff; return (ratio <= eps); } @@ -60,26 +58,25 @@ struct CompareApproxAbs { }; template -T abs(const T& a) -{ +T abs(const T &a) { return a > T(0) ? a : -a; } /* - * @brief Helper function to compare 2 device n-D arrays with custom comparison - * @tparam T the data type of the arrays - * @tparam L the comparator lambda or object function - * @param expected expected value(s) - * @param actual actual values - * @param eq_compare the comparator - * @param stream cuda stream - * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE - * @{ - */ + * @brief Helper function to compare 2 device n-D arrays with custom comparison + * @tparam T the data type of the arrays + * @tparam L the comparator lambda or object function + * @param expected expected value(s) + * @param actual actual values + * @param eq_compare the comparator + * @param stream cuda stream + * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE + * @{ + */ template -testing::AssertionResult devArrMatch( - const T* expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0) -{ +testing::AssertionResult devArrMatch(const T *expected, const T *actual, + size_t size, L eq_compare, + cudaStream_t stream = 0) { std::unique_ptr exp_h(new T[size]); std::unique_ptr act_h(new T[size]); raft::update_host(exp_h.get(), expected, size, stream); @@ -89,16 +86,16 @@ testing::AssertionResult devArrMatch( auto exp = exp_h.get()[i]; auto act = act_h.get()[i]; if (!eq_compare(exp, act)) { - return testing::AssertionFailure() << "actual=" << act << " != expected=" << exp << " @" << i; + return testing::AssertionFailure() + << "actual=" << act << " != expected=" << exp << " @" << i; } } return testing::AssertionSuccess(); } template -testing::AssertionResult devArrMatch( - T expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0) -{ +testing::AssertionResult devArrMatch(T expected, const T *actual, size_t size, + L eq_compare, cudaStream_t stream = 0) { std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual, size, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -113,13 +110,9 @@ testing::AssertionResult devArrMatch( } template -testing::AssertionResult devArrMatch(const T* expected, - const T* actual, - size_t rows, - size_t cols, - L eq_compare, - cudaStream_t stream = 0) -{ +testing::AssertionResult devArrMatch(const T *expected, const T *actual, + size_t rows, size_t cols, L eq_compare, + cudaStream_t stream = 0) { size_t size = rows * cols; std::unique_ptr exp_h(new T[size]); std::unique_ptr act_h(new T[size]); @@ -133,7 +126,8 @@ testing::AssertionResult devArrMatch(const T* expected, auto act = act_h.get()[idx]; if (!eq_compare(exp, act)) { return testing::AssertionFailure() - << "actual=" << act << " != expected=" << exp << " @" << i << "," << j; + << "actual=" << act << " != expected=" << exp << " @" << i << "," + << j; } } } @@ -141,9 +135,9 @@ testing::AssertionResult devArrMatch(const T* expected, } template -testing::AssertionResult devArrMatch( - T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0) -{ +testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows, + size_t cols, L eq_compare, + cudaStream_t stream = 0) { size_t size = rows * cols; std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual, size, stream); @@ -154,7 +148,8 @@ testing::AssertionResult devArrMatch( auto act = act_h.get()[idx]; if (!eq_compare(expected, act)) { return testing::AssertionFailure() - << "actual=" << act << " != expected=" << expected << " @" << i << "," << j; + << "actual=" << act << " != expected=" << expected << " @" << i + << "," << j; } } } @@ -162,24 +157,24 @@ testing::AssertionResult devArrMatch( } /* - * @brief Helper function to compare a device n-D arrays with an expected array - * on the host, using a custom comparison - * @tparam T the data type of the arrays - * @tparam L the comparator lambda or object function - * @param expected_h host array of expected value(s) - * @param actual_d device array actual values - * @param eq_compare the comparator - * @param stream cuda stream - * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE - */ + * @brief Helper function to compare a device n-D arrays with an expected array + * on the host, using a custom comparison + * @tparam T the data type of the arrays + * @tparam L the comparator lambda or object function + * @param expected_h host array of expected value(s) + * @param actual_d device array actual values + * @param eq_compare the comparator + * @param stream cuda stream + * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE + */ template -testing::AssertionResult devArrMatchHost( - const T* expected_h, const T* actual_d, size_t size, L eq_compare, cudaStream_t stream = 0) -{ +testing::AssertionResult devArrMatchHost(const T *expected_h, const T *actual_d, + size_t size, L eq_compare, + cudaStream_t stream = 0) { std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual_d, size, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); - bool ok = true; + bool ok = true; auto fail = testing::AssertionFailure(); for (size_t i(0); i < size; ++i) { auto exp = expected_h[i]; @@ -194,19 +189,19 @@ testing::AssertionResult devArrMatchHost( } /* - * @brief Helper function to compare diagonal values of a 2D matrix - * @tparam T the data type of the arrays - * @tparam L the comparator lambda or object function - * @param expected expected value along diagonal - * @param actual actual matrix - * @param eq_compare the comparator - * @param stream cuda stream - * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE - */ + * @brief Helper function to compare diagonal values of a 2D matrix + * @tparam T the data type of the arrays + * @tparam L the comparator lambda or object function + * @param expected expected value along diagonal + * @param actual actual matrix + * @param eq_compare the comparator + * @param stream cuda stream + * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE + */ template -testing::AssertionResult diagonalMatch( - T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0) -{ +testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows, + size_t cols, L eq_compare, + cudaStream_t stream = 0) { size_t size = rows * cols; std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual, size, stream); @@ -218,7 +213,8 @@ testing::AssertionResult diagonalMatch( auto act = act_h.get()[idx]; if (!eq_compare(expected, act)) { return testing::AssertionFailure() - << "actual=" << act << " != expected=" << expected << " @" << i << "," << j; + << "actual=" << act << " != expected=" << expected << " @" << i + << "," << j; } } } @@ -226,10 +222,10 @@ testing::AssertionResult diagonalMatch( } template -testing::AssertionResult match(const T expected, T actual, L eq_compare) -{ +testing::AssertionResult match(const T expected, T actual, L eq_compare) { if (!eq_compare(expected, actual)) { - return testing::AssertionFailure() << "actual=" << actual << " != expected=" << expected; + return testing::AssertionFailure() + << "actual=" << actual << " != expected=" << expected; } return testing::AssertionSuccess(); } From fc7eba1c87363081c3060344e9f6949659ccb896 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Wed, 24 Nov 2021 18:01:20 -0500 Subject: [PATCH 4/5] Formatting changes --- cpp/include/raft.hpp | 3 +- cpp/include/raft/cache/cache_util.cuh | 105 +- cpp/include/raft/common/cub_wrappers.cuh | 42 +- .../raft/common/device_loads_stores.cuh | 87 +- cpp/include/raft/common/scatter.cuh | 77 +- cpp/include/raft/comms/comms.hpp | 359 ++-- cpp/include/raft/comms/helper.hpp | 35 +- cpp/include/raft/comms/mpi_comms.hpp | 314 ++-- cpp/include/raft/comms/std_comms.hpp | 327 ++-- cpp/include/raft/comms/test.hpp | 239 ++- cpp/include/raft/comms/ucp_helper.hpp | 138 +- cpp/include/raft/comms/util.hpp | 114 +- cpp/include/raft/cuda_utils.cuh | 259 ++- cpp/include/raft/cudart_utils.h | 238 +-- cpp/include/raft/device_atomics.cuh | 285 ++- cpp/include/raft/distance/detail/canberra.cuh | 136 +- .../raft/distance/detail/chebyshev.cuh | 136 +- .../raft/distance/detail/correlation.cuh | 255 ++- cpp/include/raft/distance/detail/cosine.cuh | 175 +- cpp/include/raft/distance/detail/distance.cuh | 698 +++++--- .../raft/distance/detail/euclidean.cuh | 314 ++-- .../raft/distance/detail/fused_l2_nn.cuh | 230 ++- cpp/include/raft/distance/detail/hamming.cuh | 155 +- .../raft/distance/detail/hellinger.cuh | 154 +- .../raft/distance/detail/jensen_shannon.cuh | 155 +- .../raft/distance/detail/kl_divergence.cuh | 253 ++- cpp/include/raft/distance/detail/l1.cuh | 128 +- .../raft/distance/detail/minkowski.cuh | 139 +- .../detail/pairwise_distance_base.cuh | 169 +- .../raft/distance/detail/russell_rao.cuh | 137 +- cpp/include/raft/distance/distance.hpp | 369 ++-- cpp/include/raft/distance/fused_l2_nn.hpp | 41 +- cpp/include/raft/error.hpp | 50 +- cpp/include/raft/handle.hpp | 120 +- cpp/include/raft/integer_utils.h | 55 +- cpp/include/raft/label/classlabels.cuh | 118 +- cpp/include/raft/label/merge_labels.cuh | 31 +- cpp/include/raft/lap/d_structs.h | 20 +- cpp/include/raft/lap/lap.cuh | 158 +- cpp/include/raft/lap/lap_functions.cuh | 366 ++-- cpp/include/raft/lap/lap_kernels.cuh | 343 ++-- cpp/include/raft/linalg/add.cuh | 35 +- cpp/include/raft/linalg/binary_op.cuh | 61 +- .../raft/linalg/cholesky_r1_update.cuh | 63 +- .../raft/linalg/coalesced_reduction.cuh | 55 +- cpp/include/raft/linalg/contractions.cuh | 79 +- cpp/include/raft/linalg/cublas_wrappers.h | 937 +++++++--- cpp/include/raft/linalg/cusolver_wrappers.h | 1317 ++++++++++---- cpp/include/raft/linalg/divide.cuh | 7 +- cpp/include/raft/linalg/eig.cuh | 214 ++- cpp/include/raft/linalg/eltwise.cuh | 56 +- cpp/include/raft/linalg/gemm.cuh | 85 +- cpp/include/raft/linalg/gemv.h | 88 +- cpp/include/raft/linalg/init.h | 6 +- cpp/include/raft/linalg/lanczos.hpp | 786 +++++--- cpp/include/raft/linalg/map.cuh | 31 +- cpp/include/raft/linalg/map_then_reduce.cuh | 92 +- cpp/include/raft/linalg/matrix_vector_op.cuh | 108 +- .../raft/linalg/mean_squared_error.cuh | 10 +- cpp/include/raft/linalg/multiply.cuh | 7 +- cpp/include/raft/linalg/norm.cuh | 92 +- cpp/include/raft/linalg/qr.cuh | 83 +- cpp/include/raft/linalg/reduce.cuh | 37 +- cpp/include/raft/linalg/strided_reduction.cuh | 74 +- cpp/include/raft/linalg/subtract.cuh | 34 +- cpp/include/raft/linalg/svd.cuh | 227 ++- cpp/include/raft/linalg/transpose.h | 60 +- cpp/include/raft/linalg/unary_op.cuh | 86 +- cpp/include/raft/matrix/detail/math.cuh | 35 +- cpp/include/raft/matrix/detail/matrix.cuh | 86 +- cpp/include/raft/matrix/math.hpp | 255 ++- cpp/include/raft/matrix/matrix.hpp | 160 +- cpp/include/raft/mr/buffer_base.hpp | 59 +- cpp/include/raft/mr/device/allocator.hpp | 9 +- cpp/include/raft/mr/device/buffer.hpp | 14 +- cpp/include/raft/mr/host/allocator.hpp | 13 +- cpp/include/raft/mr/host/buffer.hpp | 21 +- cpp/include/raft/pow2_utils.cuh | 47 +- cpp/include/raft/random/detail/rng_impl.cuh | 432 +++-- cpp/include/raft/random/rng.hpp | 101 +- cpp/include/raft/sparse/convert/coo.cuh | 20 +- cpp/include/raft/sparse/convert/csr.cuh | 114 +- cpp/include/raft/sparse/convert/dense.cuh | 35 +- cpp/include/raft/sparse/coo.cuh | 192 +- cpp/include/raft/sparse/csr.cuh | 129 +- cpp/include/raft/sparse/cusparse_wrappers.h | 1590 ++++++++++++----- cpp/include/raft/sparse/distance/common.h | 18 +- .../sparse/distance/detail/bin_distance.cuh | 187 +- .../raft/sparse/distance/detail/coo_spmv.cuh | 118 +- .../distance/detail/coo_spmv_kernel.cuh | 196 +- .../coo_spmv_strategies/base_strategy.cuh | 138 +- .../coo_mask_row_iterators.cuh | 166 +- .../dense_smem_strategy.cuh | 104 +- .../coo_spmv_strategies/hash_strategy.cuh | 277 +-- .../sparse/distance/detail/ip_distance.cuh | 39 +- .../sparse/distance/detail/l2_distance.cuh | 384 ++-- .../sparse/distance/detail/lp_distance.cuh | 199 ++- .../raft/sparse/distance/detail/operators.cuh | 29 +- .../raft/sparse/distance/detail/utils.cuh | 6 +- cpp/include/raft/sparse/distance/distance.hpp | 65 +- cpp/include/raft/sparse/hierarchy/common.h | 10 +- .../sparse/hierarchy/detail/agglomerative.cuh | 128 +- .../hierarchy/detail/connectivities.cuh | 83 +- .../raft/sparse/hierarchy/detail/mst.cuh | 84 +- .../raft/sparse/hierarchy/single_linkage.hpp | 62 +- cpp/include/raft/sparse/linalg/add.cuh | 116 +- cpp/include/raft/sparse/linalg/degree.cuh | 56 +- cpp/include/raft/sparse/linalg/norm.cuh | 51 +- cpp/include/raft/sparse/linalg/spectral.cuh | 65 +- cpp/include/raft/sparse/linalg/symmetrize.cuh | 154 +- cpp/include/raft/sparse/linalg/transpose.h | 60 +- .../raft/sparse/mst/detail/mst_kernels.cuh | 160 +- .../raft/sparse/mst/detail/mst_solver_inl.cuh | 258 +-- cpp/include/raft/sparse/mst/detail/utils.cuh | 19 +- cpp/include/raft/sparse/mst/mst.cuh | 34 +- cpp/include/raft/sparse/mst/mst_solver.cuh | 48 +- cpp/include/raft/sparse/op/filter.cuh | 105 +- cpp/include/raft/sparse/op/reduce.cuh | 54 +- cpp/include/raft/sparse/op/row_op.cuh | 16 +- cpp/include/raft/sparse/op/slice.h | 34 +- cpp/include/raft/sparse/op/sort.h | 21 +- .../sparse/selection/connect_components.cuh | 214 ++- cpp/include/raft/sparse/selection/knn.cuh | 441 +++-- .../raft/sparse/selection/knn_graph.cuh | 52 +- cpp/include/raft/sparse/utils.h | 22 +- cpp/include/raft/spatial/knn/ann.hpp | 31 +- cpp/include/raft/spatial/knn/ann_common.h | 10 +- cpp/include/raft/spatial/knn/ball_cover.hpp | 82 +- .../raft/spatial/knn/ball_cover_common.h | 37 +- .../knn/detail/ann_quantized_faiss.cuh | 130 +- .../raft/spatial/knn/detail/ball_cover.cuh | 322 ++-- .../spatial/knn/detail/ball_cover/common.cuh | 26 +- .../knn/detail/ball_cover/registers.cuh | 613 +++++-- .../spatial/knn/detail/block_select_faiss.cuh | 80 +- .../raft/spatial/knn/detail/common_faiss.h | 37 +- .../raft/spatial/knn/detail/fused_l2_knn.cuh | 802 ++++++--- .../spatial/knn/detail/haversine_distance.cuh | 56 +- .../knn/detail/knn_brute_force_faiss.cuh | 188 +- .../raft/spatial/knn/detail/processing.hpp | 121 +- .../spatial/knn/detail/selection_faiss.cuh | 99 +- .../spatial/knn/detail/warp_select_faiss.cuh | 276 +-- cpp/include/raft/spatial/knn/knn.hpp | 75 +- cpp/include/raft/spectral/cluster_solvers.hpp | 39 +- cpp/include/raft/spectral/eigen_solvers.hpp | 66 +- cpp/include/raft/spectral/kmeans.hpp | 402 +++-- cpp/include/raft/spectral/lapack.hpp | 552 ++++-- cpp/include/raft/spectral/matrix_wrappers.hpp | 260 +-- .../raft/spectral/modularity_maximization.hpp | 44 +- cpp/include/raft/spectral/partition.hpp | 53 +- cpp/include/raft/spectral/spectral_util.hpp | 118 +- cpp/include/raft/spectral/warn_dbg.hpp | 4 +- cpp/include/raft/stats/detail/mean.cuh | 42 +- cpp/include/raft/stats/detail/stddev.cuh | 136 +- cpp/include/raft/stats/detail/sum.cuh | 38 +- cpp/include/raft/stats/mean.hpp | 5 +- cpp/include/raft/stats/mean_center.hpp | 45 +- cpp/include/raft/stats/stddev.hpp | 22 +- cpp/include/raft/stats/sum.hpp | 4 +- cpp/include/raft/vectorized.cuh | 128 +- cpp/test/cluster_solvers.cu | 16 +- cpp/test/cudart_utils.cpp | 3 +- cpp/test/distance/dist_adj.cu | 94 +- cpp/test/distance/dist_canberra.cu | 24 +- cpp/test/distance/dist_chebyshev.cu | 24 +- cpp/test/distance/dist_correlation.cu | 24 +- cpp/test/distance/dist_cos.cu | 25 +- cpp/test/distance/dist_euc_exp.cu | 24 +- cpp/test/distance/dist_euc_unexp.cu | 20 +- cpp/test/distance/dist_hamming.cu | 24 +- cpp/test/distance/dist_hellinger.cu | 24 +- cpp/test/distance/dist_jensen_shannon.cu | 20 +- cpp/test/distance/dist_kl_divergence.cu | 20 +- cpp/test/distance/dist_l1.cu | 24 +- cpp/test/distance/dist_minkowski.cu | 23 +- cpp/test/distance/dist_russell_rao.cu | 24 +- cpp/test/distance/distance_base.cuh | 311 ++-- cpp/test/distance/fused_l2_nn.cu | 208 ++- cpp/test/eigen_solvers.cu | 31 +- cpp/test/handle.cpp | 18 +- cpp/test/integer_utils.cpp | 6 +- cpp/test/label/label.cu | 26 +- cpp/test/label/merge_labels.cu | 67 +- cpp/test/lap/lap.cu | 93 +- cpp/test/linalg/add.cu | 14 +- cpp/test/linalg/add.cuh | 17 +- cpp/test/linalg/binary_op.cu | 94 +- cpp/test/linalg/binary_op.cuh | 17 +- cpp/test/linalg/cholesky_r1.cu | 50 +- cpp/test/linalg/coalesced_reduction.cu | 64 +- cpp/test/linalg/divide.cu | 53 +- cpp/test/linalg/eig.cu | 206 ++- cpp/test/linalg/eig_sel.cu | 100 +- cpp/test/linalg/eltwise.cu | 104 +- cpp/test/linalg/gemm_layout.cu | 63 +- cpp/test/linalg/gemv.cu | 76 +- cpp/test/linalg/map.cu | 108 +- cpp/test/linalg/map_then_reduce.cu | 101 +- cpp/test/linalg/matrix_vector_op.cu | 128 +- cpp/test/linalg/matrix_vector_op.cuh | 73 +- cpp/test/linalg/multiply.cu | 33 +- cpp/test/linalg/norm.cu | 150 +- cpp/test/linalg/reduce.cu | 86 +- cpp/test/linalg/reduce.cuh | 51 +- cpp/test/linalg/strided_reduction.cu | 57 +- cpp/test/linalg/subtract.cu | 75 +- cpp/test/linalg/svd.cu | 120 +- cpp/test/linalg/transpose.cu | 63 +- cpp/test/linalg/unary_op.cu | 47 +- cpp/test/linalg/unary_op.cuh | 17 +- cpp/test/matrix/math.cu | 213 +-- cpp/test/matrix/matrix.cu | 81 +- cpp/test/mr/device/buffer.cpp | 16 +- cpp/test/mr/host/buffer.cpp | 9 +- cpp/test/mst.cu | 182 +- cpp/test/pow2_utils.cu | 28 +- cpp/test/random/rng.cu | 210 +-- cpp/test/random/rng_int.cu | 60 +- cpp/test/random/sample_without_replacement.cu | 42 +- cpp/test/sparse/add.cu | 118 +- cpp/test/sparse/connect_components.cu | 593 +++--- cpp/test/sparse/convert_coo.cu | 22 +- cpp/test/sparse/convert_csr.cu | 55 +- cpp/test/sparse/csr_row_slice.cu | 77 +- cpp/test/sparse/csr_to_dense.cu | 64 +- cpp/test/sparse/csr_transpose.cu | 70 +- cpp/test/sparse/degree.cu | 45 +- cpp/test/sparse/dist_coo_spmv.cu | 922 +++++----- cpp/test/sparse/distance.cu | 244 ++- cpp/test/sparse/filter.cu | 30 +- cpp/test/sparse/knn.cu | 78 +- cpp/test/sparse/knn_graph.cu | 32 +- cpp/test/sparse/linkage.cu | 632 +++---- cpp/test/sparse/norm.cu | 25 +- cpp/test/sparse/reduce.cu | 48 +- cpp/test/sparse/row_op.cu | 43 +- cpp/test/sparse/sort.cu | 19 +- cpp/test/sparse/symmetrize.cu | 86 +- cpp/test/spatial/ball_cover.cu | 200 ++- cpp/test/spatial/fused_l2_knn.cu | 108 +- cpp/test/spatial/haversine.cu | 71 +- cpp/test/spatial/knn.cu | 82 +- cpp/test/spatial/selection.cu | 55 +- cpp/test/spatial/spatial_data.h | 31 +- cpp/test/spectral_matrix.cu | 13 +- cpp/test/stats/mean.cu | 98 +- cpp/test/stats/mean_center.cu | 80 +- cpp/test/stats/stddev.cu | 52 +- cpp/test/stats/sum.cu | 23 +- cpp/test/test_utils.h | 143 +- 249 files changed, 19679 insertions(+), 13246 deletions(-) diff --git a/cpp/include/raft.hpp b/cpp/include/raft.hpp index f380d276b2..08f836d3a8 100644 --- a/cpp/include/raft.hpp +++ b/cpp/include/raft.hpp @@ -21,7 +21,8 @@ namespace raft { /* Function for testing RAFT include * * @return message indicating RAFT has been included succesfully*/ -inline std::string test_raft() { +inline std::string test_raft() +{ std::string status = "RAFT Setup succesfully"; return status; } diff --git a/cpp/include/raft/cache/cache_util.cuh b/cpp/include/raft/cache/cache_util.cuh index a65227c402..dc9327bb94 100644 --- a/cpp/include/raft/cache/cache_util.cuh +++ b/cpp/include/raft/cache/cache_util.cuh @@ -42,17 +42,16 @@ namespace cache { * @param [out] out vectors collected from the cache, size [n_vec * n] */ template -__global__ void get_vecs(const math_t *cache, int_t n_vec, - const idx_t *cache_idx, int_t n, math_t *out) { +__global__ void get_vecs( + const math_t* cache, int_t n_vec, const idx_t* cache_idx, int_t n, math_t* out) +{ int tid = threadIdx.x + blockIdx.x * blockDim.x; int row = tid % n_vec; // row idx if (tid < n_vec * n) { - size_t out_col = tid / n_vec; // col idx + size_t out_col = tid / n_vec; // col idx size_t cache_col = cache_idx[out_col]; if (cache_idx[out_col] >= 0) { - if (row + out_col * n_vec < (size_t)n_vec * n) { - out[tid] = cache[row + cache_col * n_vec]; - } + if (row + out_col * n_vec < (size_t)n_vec * n) { out[tid] = cache[row + cache_col * n_vec]; } } } } @@ -84,21 +83,26 @@ __global__ void get_vecs(const math_t *cache, int_t n_vec, * @param [in] n_cache_vecs */ template -__global__ void store_vecs(const math_t *tile, int n_tile, int n_vec, - const int *tile_idx, int n, const int *cache_idx, - math_t *cache, int n_cache_vecs) { +__global__ void store_vecs(const math_t* tile, + int n_tile, + int n_vec, + const int* tile_idx, + int n, + const int* cache_idx, + math_t* cache, + int n_cache_vecs) +{ int tid = threadIdx.x + blockIdx.x * blockDim.x; int row = tid % n_vec; // row idx if (tid < n_vec * n) { - int tile_col = tid / n_vec; // col idx - int data_col = tile_idx ? tile_idx[tile_col] : tile_col; + int tile_col = tid / n_vec; // col idx + int data_col = tile_idx ? tile_idx[tile_col] : tile_col; int cache_col = cache_idx[tile_col]; // We ignore negative values. The rest of the checks should be fulfilled // if the cache is used properly if (cache_col >= 0 && cache_col < n_cache_vecs && data_col < n_tile) { - cache[row + (size_t)cache_col * n_vec] = - tile[row + (size_t)data_col * n_vec]; + cache[row + (size_t)cache_col * n_vec] = tile[row + (size_t)data_col * n_vec]; } } } @@ -121,14 +125,15 @@ int DI hash(int key, int n_cache_sets) { return key % n_cache_sets; } * @return the index of the first element in the array for which * array[idx] >= value. If there is no such value, then return n. */ -int DI arg_first_ge(const int *array, int n, int val) { +int DI arg_first_ge(const int* array, int n, int val) +{ int start = 0; - int end = n - 1; + int end = n - 1; if (array[0] == val) return 0; if (array[end] < val) return n; while (start + 1 < end) { int q = (start + end + 1) / 2; - //invariants: + // invariants: // start < end // start < q <=end // array[start] < val && array[end] <=val @@ -157,7 +162,8 @@ int DI arg_first_ge(const int *array, int n, int val) { * @return the idx of the k-th occurance of val in array, or -1 if * the value is not found. */ -int DI find_nth_occurrence(const int *array, int n, int val, int k) { +int DI find_nth_occurrence(const int* array, int n, int val, int k) +{ int q = arg_first_ge(array, n, val); if (q + k < n && array[q + k] == val) { q += k; @@ -196,10 +202,10 @@ int DI find_nth_occurrence(const int *array, int n, int val, int k) { * Each block should give a different pointer for rank. */ template -DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) { +DI void rank_set_entries(const int* cache_time, int n_cache_sets, int* rank) +{ const int items_per_thread = raft::ceildiv(associativity, nthreads); - typedef cub::BlockRadixSort - BlockRadixSort; + typedef cub::BlockRadixSort BlockRadixSort; __shared__ typename BlockRadixSort::TempStorage temp_storage; int key[items_per_thread]; @@ -208,8 +214,8 @@ DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) { int block_offset = blockIdx.x * associativity; for (int j = 0; j < items_per_thread; j++) { - int k = threadIdx.x + j * nthreads; - int t = (k < associativity) ? cache_time[block_offset + k] : 32768; + int k = threadIdx.x + j * nthreads; + int t = (k < associativity) ? cache_time[block_offset + k] : 32768; key[j] = t; val[j] = k; } @@ -217,9 +223,7 @@ DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) { BlockRadixSort(temp_storage).Sort(key, val); for (int j = 0; j < items_per_thread; j++) { - if (val[j] < associativity) { - rank[val[j]] = threadIdx.x * items_per_thread + j; - } + if (val[j] < associativity) { rank[val[j]] = threadIdx.x * items_per_thread + j; } } __syncthreads(); } @@ -252,9 +256,15 @@ DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) { * not be cached, size [n] */ template -__global__ void assign_cache_idx(const int *keys, int n, const int *cache_set, - int *cached_keys, int n_cache_sets, - int *cache_time, int time, int *cache_idx) { +__global__ void assign_cache_idx(const int* keys, + int n, + const int* cache_set, + int* cached_keys, + int n_cache_sets, + int* cache_time, + int time, + int* cache_idx) +{ int block_offset = blockIdx.x * associativity; const int items_per_thread = raft::ceildiv(associativity, nthreads); @@ -273,7 +283,7 @@ __global__ void assign_cache_idx(const int *keys, int n, const int *cache_set, // these elements are assigned -1. for (int j = 0; j < items_per_thread; j++) { - int i = threadIdx.x + j * nthreads; + int i = threadIdx.x + j * nthreads; int t_idx = block_offset + i; bool mask = (i < associativity); // whether this slot is available for writing @@ -284,10 +294,10 @@ __global__ void assign_cache_idx(const int *keys, int n, const int *cache_set, if (mask) { int k = find_nth_occurrence(cache_set, n, blockIdx.x, rank[i]); if (k > -1) { - int key_val = keys[k]; + int key_val = keys[k]; cached_keys[t_idx] = key_val; - cache_idx[k] = t_idx; - cache_time[t_idx] = time; + cache_idx[k] = t_idx; + cache_time[t_idx] = time; } } } @@ -315,21 +325,28 @@ namespace { * @param [inout] cached_keys keys stored in the cache, size [n_cache_sets * associativity] * @param [in] n_cache_sets number of cache sets * @param [in] associativity number of keys in cache set - * @param [inout] cache_time time stamp when the indices were cached, size [n_cache_sets * associativity] + * @param [inout] cache_time time stamp when the indices were cached, size [n_cache_sets * + * associativity] * @param [out] cache_idx cache indices of the working set elements, size [n] * @param [out] is_cached whether the element is cached size[n] * @param [in] time iteration counter (used for time stamping) */ -__global__ void get_cache_idx(int *keys, int n, int *cached_keys, - int n_cache_sets, int associativity, - int *cache_time, int *cache_idx, bool *is_cached, - int time) { +__global__ void get_cache_idx(int* keys, + int n, + int* cached_keys, + int n_cache_sets, + int associativity, + int* cache_time, + int* cache_idx, + bool* is_cached, + int time) +{ int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < n) { - int widx = keys[tid]; - int sidx = hash(widx, n_cache_sets); - int cidx = sidx * associativity; - int i = 0; + int widx = keys[tid]; + int sidx = hash(widx, n_cache_sets); + int cidx = sidx * associativity; + int i = 0; bool found = false; // search for empty spot and the least recently used spot while (i < associativity && !found) { @@ -338,9 +355,9 @@ __global__ void get_cache_idx(int *keys, int n, int *cached_keys, } is_cached[tid] = found; if (found) { - cidx = cidx + i - 1; - cache_time[cidx] = time; //update time stamp - cache_idx[tid] = cidx; //exact cache idx + cidx = cidx + i - 1; + cache_time[cidx] = time; // update time stamp + cache_idx[tid] = cidx; // exact cache idx } else { cache_idx[tid] = sidx; // assign cache set } diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh index 8e3519fea5..32a46968b6 100644 --- a/cpp/include/raft/common/cub_wrappers.cuh +++ b/cpp/include/raft/common/cub_wrappers.cuh @@ -22,28 +22,32 @@ namespace raft { /** - * @brief Convenience wrapper over cub's SortPairs method - * @tparam KeyT key type - * @tparam ValueT value type - * @param workspace workspace buffer which will get resized if not enough space - * @param inKeys input keys array - * @param outKeys output keys array - * @param inVals input values array - * @param outVals output values array - * @param len array length - * @param stream cuda stream - */ + * @brief Convenience wrapper over cub's SortPairs method + * @tparam KeyT key type + * @tparam ValueT value type + * @param workspace workspace buffer which will get resized if not enough space + * @param inKeys input keys array + * @param outKeys output keys array + * @param inVals input values array + * @param outVals output values array + * @param len array length + * @param stream cuda stream + */ template -void sortPairs(rmm::device_uvector &workspace, const KeyT *inKeys, - KeyT *outKeys, const ValueT *inVals, ValueT *outVals, int len, - cudaStream_t stream) { +void sortPairs(rmm::device_uvector& workspace, + const KeyT* inKeys, + KeyT* outKeys, + const ValueT* inVals, + ValueT* outVals, + int len, + cudaStream_t stream) +{ size_t worksize; - cub::DeviceRadixSort::SortPairs(nullptr, worksize, inKeys, outKeys, inVals, - outVals, len, 0, sizeof(KeyT) * 8, stream); + cub::DeviceRadixSort::SortPairs( + nullptr, worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream); workspace.resize(worksize, stream); - cub::DeviceRadixSort::SortPairs(workspace.data(), worksize, inKeys, outKeys, - inVals, outVals, len, 0, sizeof(KeyT) * 8, - stream); + cub::DeviceRadixSort::SortPairs( + workspace.data(), worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream); } } // namespace raft diff --git a/cpp/include/raft/common/device_loads_stores.cuh b/cpp/include/raft/common/device_loads_stores.cuh index bb2b019ecb..41dc9cab08 100644 --- a/cpp/include/raft/common/device_loads_stores.cuh +++ b/cpp/include/raft/common/device_loads_stores.cuh @@ -31,40 +31,43 @@ namespace raft { * @param[out] addr shared memory address (should be aligned to vector size) * @param[in] x data to be stored at this address */ -DI void sts(float* addr, const float& x) { +DI void sts(float* addr, const float& x) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x)); } -DI void sts(float* addr, const float (&x)[1]) { +DI void sts(float* addr, const float (&x)[1]) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x[0])); } -DI void sts(float* addr, const float (&x)[2]) { +DI void sts(float* addr, const float (&x)[2]) +{ auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v2.f32 [%0], {%1, %2};" - : - : "l"(s2), "f"(x[0]), "f"(x[1])); + asm volatile("st.shared.v2.f32 [%0], {%1, %2};" : : "l"(s2), "f"(x[0]), "f"(x[1])); } -DI void sts(float* addr, const float (&x)[4]) { +DI void sts(float* addr, const float (&x)[4]) +{ auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.v4.f32 [%0], {%1, %2, %3, %4};" : : "l"(s4), "f"(x[0]), "f"(x[1]), "f"(x[2]), "f"(x[3])); } -DI void sts(double* addr, const double& x) { +DI void sts(double* addr, const double& x) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x)); } -DI void sts(double* addr, const double (&x)[1]) { +DI void sts(double* addr, const double (&x)[1]) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x[0])); } -DI void sts(double* addr, const double (&x)[2]) { +DI void sts(double* addr, const double (&x)[2]) +{ auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("st.shared.v2.f64 [%0], {%1, %2};" - : - : "l"(s2), "d"(x[0]), "d"(x[1])); + asm volatile("st.shared.v2.f64 [%0], {%1, %2};" : : "l"(s2), "d"(x[0]), "d"(x[1])); } /** @} */ @@ -80,39 +83,42 @@ DI void sts(double* addr, const double (&x)[2]) { * @param[in] addr shared memory address from where to load * (should be aligned to vector size) */ -DI void lds(float& x, float* addr) { +DI void lds(float& x, float* addr) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "l"(s1)); } -DI void lds(float (&x)[1], float* addr) { +DI void lds(float (&x)[1], float* addr) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x[0]) : "l"(s1)); } -DI void lds(float (&x)[2], float* addr) { +DI void lds(float (&x)[2], float* addr) +{ auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" - : "=f"(x[0]), "=f"(x[1]) - : "l"(s2)); + asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(s2)); } -DI void lds(float (&x)[4], float* addr) { +DI void lds(float (&x)[4], float* addr) +{ auto s4 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3]) : "l"(s4)); } -DI void lds(double& x, double* addr) { +DI void lds(double& x, double* addr) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x) : "l"(s1)); } -DI void lds(double (&x)[1], double* addr) { +DI void lds(double (&x)[1], double* addr) +{ auto s1 = __cvta_generic_to_shared(reinterpret_cast(addr)); asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x[0]) : "l"(s1)); } -DI void lds(double (&x)[2], double* addr) { +DI void lds(double (&x)[2], double* addr) +{ auto s2 = __cvta_generic_to_shared(reinterpret_cast(addr)); - asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];" - : "=d"(x[0]), "=d"(x[1]) - : "l"(s2)); + asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(s2)); } /** @} */ @@ -123,32 +129,35 @@ DI void lds(double (&x)[2], double* addr) { * @param[out] x data to be loaded from global memory * @param[in] addr address in global memory from where to load */ -DI void ldg(float& x, const float* addr) { +DI void ldg(float& x, const float* addr) +{ asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x) : "l"(addr)); } -DI void ldg(float (&x)[1], const float* addr) { +DI void ldg(float (&x)[1], const float* addr) +{ asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x[0]) : "l"(addr)); } -DI void ldg(float (&x)[2], const float* addr) { - asm volatile("ld.global.cg.v2.f32 {%0, %1}, [%2];" - : "=f"(x[0]), "=f"(x[1]) - : "l"(addr)); +DI void ldg(float (&x)[2], const float* addr) +{ + asm volatile("ld.global.cg.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(addr)); } -DI void ldg(float (&x)[4], const float* addr) { +DI void ldg(float (&x)[4], const float* addr) +{ asm volatile("ld.global.cg.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3]) : "l"(addr)); } -DI void ldg(double& x, const double* addr) { +DI void ldg(double& x, const double* addr) +{ asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x) : "l"(addr)); } -DI void ldg(double (&x)[1], const double* addr) { +DI void ldg(double (&x)[1], const double* addr) +{ asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x[0]) : "l"(addr)); } -DI void ldg(double (&x)[2], const double* addr) { - asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];" - : "=d"(x[0]), "=d"(x[1]) - : "l"(addr)); +DI void ldg(double (&x)[2], const double* addr) +{ + asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(addr)); } /** @} */ diff --git a/cpp/include/raft/common/scatter.cuh b/cpp/include/raft/common/scatter.cuh index 785794461e..b228ac5499 100644 --- a/cpp/include/raft/common/scatter.cuh +++ b/cpp/include/raft/common/scatter.cuh @@ -22,8 +22,8 @@ namespace raft { template -__global__ void scatterKernel(DataT *out, const DataT *in, const IdxT *idx, - IdxT len, Lambda op) { +__global__ void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op) +{ typedef TxN_t DataVec; typedef TxN_t IdxVec; IdxT tid = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x); @@ -34,61 +34,60 @@ __global__ void scatterKernel(DataT *out, const DataT *in, const IdxT *idx, DataVec dataIn; #pragma unroll for (int i = 0; i < VecLen; ++i) { - auto inPos = idxIn.val.data[i]; + auto inPos = idxIn.val.data[i]; dataIn.val.data[i] = op(in[inPos], tid + i); } dataIn.store(out, tid); } template -void scatterImpl(DataT *out, const DataT *in, const IdxT *idx, IdxT len, - Lambda op, cudaStream_t stream) { +void scatterImpl( + DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op, cudaStream_t stream) +{ const IdxT nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxT)TPB); - scatterKernel - <<>>(out, in, idx, len, op); + scatterKernel<<>>(out, in, idx, len, op); CUDA_CHECK(cudaGetLastError()); } /** - * @brief Performs scatter operation based on the input indexing array - * @tparam DataT data type whose array gets scattered - * @tparam IdxT indexing type - * @tparam TPB threads-per-block in the final kernel launched - * @tparam Lambda the device-lambda performing a unary operation on the loaded - * data before it gets scattered - * @param out the output array - * @param in the input array - * @param idx the indexing array - * @param len number of elements in the input array - * @param stream cuda stream where to launch work - * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This - * will be applied to every element before scattering it to the right location. - * The second param in this method will be the destination index. - */ -template , int TPB = 256> -void scatter(DataT *out, const DataT *in, const IdxT *idx, IdxT len, - cudaStream_t stream, Lambda op = raft::Nop()) { + * @brief Performs scatter operation based on the input indexing array + * @tparam DataT data type whose array gets scattered + * @tparam IdxT indexing type + * @tparam TPB threads-per-block in the final kernel launched + * @tparam Lambda the device-lambda performing a unary operation on the loaded + * data before it gets scattered + * @param out the output array + * @param in the input array + * @param idx the indexing array + * @param len number of elements in the input array + * @param stream cuda stream where to launch work + * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This + * will be applied to every element before scattering it to the right location. + * The second param in this method will be the destination index. + */ +template , int TPB = 256> +void scatter(DataT* out, + const DataT* in, + const IdxT* idx, + IdxT len, + cudaStream_t stream, + Lambda op = raft::Nop()) +{ if (len <= 0) return; - constexpr size_t DataSize = sizeof(DataT); - constexpr size_t IdxSize = sizeof(IdxT); + constexpr size_t DataSize = sizeof(DataT); + constexpr size_t IdxSize = sizeof(IdxT); constexpr size_t MaxPerElem = DataSize > IdxSize ? DataSize : IdxSize; - size_t bytes = len * MaxPerElem; + size_t bytes = len * MaxPerElem; if (16 / MaxPerElem && bytes % 16 == 0) { - scatterImpl(out, in, idx, len, - op, stream); + scatterImpl(out, in, idx, len, op, stream); } else if (8 / MaxPerElem && bytes % 8 == 0) { - scatterImpl(out, in, idx, len, op, - stream); + scatterImpl(out, in, idx, len, op, stream); } else if (4 / MaxPerElem && bytes % 4 == 0) { - scatterImpl(out, in, idx, len, op, - stream); + scatterImpl(out, in, idx, len, op, stream); } else if (2 / MaxPerElem && bytes % 2 == 0) { - scatterImpl(out, in, idx, len, op, - stream); + scatterImpl(out, in, idx, len, op, stream); } else if (1 / MaxPerElem) { - scatterImpl(out, in, idx, len, op, - stream); + scatterImpl(out, in, idx, len, op, stream); } else { scatterImpl(out, in, idx, len, op, stream); } diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index bd8a4ce9e7..68b8e723e9 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -25,16 +25,7 @@ namespace raft { namespace comms { typedef unsigned int request_t; -enum class datatype_t { - CHAR, - UINT8, - INT32, - UINT32, - INT64, - UINT64, - FLOAT32, - FLOAT64 -}; +enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 }; enum class op_t { SUM, PROD, MIN, MAX }; /** @@ -50,42 +41,50 @@ template constexpr datatype_t get_type(); template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::CHAR; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::UINT8; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::INT32; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::UINT32; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::INT64; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::UINT64; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::FLOAT32; } template <> -constexpr datatype_t get_type() { +constexpr datatype_t get_type() +{ return datatype_t::FLOAT64; } @@ -95,76 +94,106 @@ class comms_iface { virtual int get_rank() const = 0; virtual std::unique_ptr comm_split(int color, int key) const = 0; - virtual void barrier() const = 0; + virtual void barrier() const = 0; virtual status_t sync_stream(cudaStream_t stream) const = 0; - virtual void isend(const void* buf, size_t size, int dest, int tag, - request_t* request) const = 0; + virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0; - virtual void irecv(void* buf, size_t size, int source, int tag, - request_t* request) const = 0; + virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0; virtual void waitall(int count, request_t array_of_requests[]) const = 0; - virtual void allreduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, + virtual void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, cudaStream_t stream) const = 0; - virtual void bcast(void* buff, size_t count, datatype_t datatype, int root, - cudaStream_t stream) const = 0; + virtual void bcast( + void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; - virtual void bcast(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, int root, + virtual void bcast(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + int root, cudaStream_t stream) const = 0; - virtual void reduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, int root, + virtual void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, cudaStream_t stream) const = 0; - virtual void allgather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, cudaStream_t stream) const = 0; - - virtual void allgatherv(const void* sendbuf, void* recvbuf, - const size_t* recvcounts, const size_t* displs, - datatype_t datatype, cudaStream_t stream) const = 0; + virtual void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const = 0; - virtual void gather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, int root, + virtual void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const = 0; + + virtual void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, cudaStream_t stream) const = 0; - virtual void gatherv(const void* sendbuf, void* recvbuf, size_t sendcount, - const size_t* recvcounts, const size_t* displs, - datatype_t datatype, int root, + virtual void gatherv(const void* sendbuf, + void* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, cudaStream_t stream) const = 0; - virtual void reducescatter(const void* sendbuff, void* recvbuff, - size_t recvcount, datatype_t datatype, op_t op, + virtual void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, cudaStream_t stream) const = 0; // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_send(const void* buf, size_t size, int dest, - cudaStream_t stream) const = 0; + virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0; // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - virtual void device_recv(void* buf, size_t size, int source, - cudaStream_t stream) const = 0; - - virtual void device_sendrecv(const void* sendbuf, size_t sendsize, int dest, - void* recvbuf, size_t recvsize, int source, + virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0; + + virtual void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, cudaStream_t stream) const = 0; - virtual void device_multicast_sendrecv( - const void* sendbuf, std::vector const& sendsizes, - std::vector const& sendoffsets, std::vector const& dests, - void* recvbuf, std::vector const& recvsizes, - std::vector const& recvoffsets, std::vector const& sources, - cudaStream_t stream) const = 0; + virtual void device_multicast_sendrecv(const void* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + void* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const = 0; }; class comms_t { public: - comms_t(std::unique_ptr impl) : impl_(impl.release()) { + comms_t(std::unique_ptr impl) : impl_(impl.release()) + { ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!"); } @@ -191,7 +220,8 @@ class comms_t { * @param color ranks w/ the same color are placed in the same communicator * @param key controls rank assignment */ - std::unique_ptr comm_split(int color, int key) const { + std::unique_ptr comm_split(int color, int key) const + { return impl_->comm_split(color, key); } @@ -208,9 +238,7 @@ class comms_t { * * @param stream the cuda stream to sync collective operations on */ - status_t sync_stream(cudaStream_t stream) const { - return impl_->sync_stream(stream); - } + status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); } /** * Performs an asynchronous point-to-point send @@ -223,10 +251,9 @@ class comms_t { * This will be used in `waitall()` to synchronize until the message is delivered (or fails). */ template - void isend(const value_t* buf, size_t size, int dest, int tag, - request_t* request) const { - impl_->isend(static_cast(buf), size * sizeof(value_t), dest, - tag, request); + void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const + { + impl_->isend(static_cast(buf), size * sizeof(value_t), dest, tag, request); } /** @@ -240,10 +267,9 @@ class comms_t { * This will be used in `waitall()` to synchronize until the message is delivered (or fails). */ template - void irecv(value_t* buf, size_t size, int source, int tag, - request_t* request) const { - impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, - request); + void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const + { + impl_->irecv(static_cast(buf), size * sizeof(value_t), source, tag, request); } /** @@ -251,7 +277,8 @@ class comms_t { * @param count number of requests to synchronize on * @param array_of_requests an array of request_t objects returned from isend/irecv */ - void waitall(int count, request_t array_of_requests[]) const { + void waitall(int count, request_t array_of_requests[]) const + { impl_->waitall(count, array_of_requests); } @@ -265,11 +292,15 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void allreduce(const value_t* sendbuff, value_t* recvbuff, size_t count, - op_t op, cudaStream_t stream) const { + void allreduce( + const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const + { impl_->allreduce(static_cast(sendbuff), - static_cast(recvbuff), count, get_type(), - op, stream); + static_cast(recvbuff), + count, + get_type(), + op, + stream); } /** @@ -281,9 +312,9 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const { - impl_->bcast(static_cast(buff), count, get_type(), root, - stream); + void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const + { + impl_->bcast(static_cast(buff), count, get_type(), root, stream); } /** @@ -296,10 +327,14 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void bcast(const value_t* sendbuff, value_t* recvbuff, size_t count, int root, - cudaStream_t stream) const { + void bcast( + const value_t* sendbuff, value_t* recvbuff, size_t count, int root, cudaStream_t stream) const + { impl_->bcast(static_cast(sendbuff), - static_cast(recvbuff), count, get_type(), root, + static_cast(recvbuff), + count, + get_type(), + root, stream); } @@ -314,11 +349,20 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void reduce(const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, - int root, cudaStream_t stream) const { + void reduce(const value_t* sendbuff, + value_t* recvbuff, + size_t count, + op_t op, + int root, + cudaStream_t stream) const + { impl_->reduce(static_cast(sendbuff), - static_cast(recvbuff), count, get_type(), op, - root, stream); + static_cast(recvbuff), + count, + get_type(), + op, + root, + stream); } /** @@ -330,11 +374,16 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void allgather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount, - cudaStream_t stream) const { + void allgather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + cudaStream_t stream) const + { impl_->allgather(static_cast(sendbuff), - static_cast(recvbuff), sendcount, - get_type(), stream); + static_cast(recvbuff), + sendcount, + get_type(), + stream); } /** @@ -349,12 +398,18 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void allgatherv(const value_t* sendbuf, value_t* recvbuf, - const size_t* recvcounts, const size_t* displs, - cudaStream_t stream) const { + void allgatherv(const value_t* sendbuf, + value_t* recvbuf, + const size_t* recvcounts, + const size_t* displs, + cudaStream_t stream) const + { impl_->allgatherv(static_cast(sendbuf), - static_cast(recvbuf), recvcounts, displs, - get_type(), stream); + static_cast(recvbuf), + recvcounts, + displs, + get_type(), + stream); } /** @@ -367,11 +422,18 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void gather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount, - int root, cudaStream_t stream) const { + void gather(const value_t* sendbuff, + value_t* recvbuff, + size_t sendcount, + int root, + cudaStream_t stream) const + { impl_->gather(static_cast(sendbuff), - static_cast(recvbuff), sendcount, get_type(), - root, stream); + static_cast(recvbuff), + sendcount, + get_type(), + root, + stream); } /** @@ -388,12 +450,22 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void gatherv(const value_t* sendbuf, value_t* recvbuf, size_t sendcount, - const size_t* recvcounts, const size_t* displs, int root, - cudaStream_t stream) const { + void gatherv(const value_t* sendbuf, + value_t* recvbuf, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + int root, + cudaStream_t stream) const + { impl_->gatherv(static_cast(sendbuf), - static_cast(recvbuf), sendcount, recvcounts, displs, - get_type(), root, stream); + static_cast(recvbuf), + sendcount, + recvcounts, + displs, + get_type(), + root, + stream); } /** @@ -406,11 +478,18 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void reducescatter(const value_t* sendbuff, value_t* recvbuff, - size_t recvcount, op_t op, cudaStream_t stream) const { + void reducescatter(const value_t* sendbuff, + value_t* recvbuff, + size_t recvcount, + op_t op, + cudaStream_t stream) const + { impl_->reducescatter(static_cast(sendbuff), - static_cast(recvbuff), recvcount, - get_type(), op, stream); + static_cast(recvbuff), + recvcount, + get_type(), + op, + stream); } /** @@ -425,10 +504,9 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_send(const value_t* buf, size_t size, int dest, - cudaStream_t stream) const { - impl_->device_send(static_cast(buf), size * sizeof(value_t), - dest, stream); + void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const + { + impl_->device_send(static_cast(buf), size * sizeof(value_t), dest, stream); } /** @@ -443,10 +521,9 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_recv(value_t* buf, size_t size, int source, - cudaStream_t stream) const { - impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, - stream); + void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const + { + impl_->device_recv(static_cast(buf), size * sizeof(value_t), source, stream); } /** @@ -462,12 +539,21 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_sendrecv(const value_t* sendbuf, size_t sendsize, int dest, - value_t* recvbuf, size_t recvsize, int source, - cudaStream_t stream) const { - impl_->device_sendrecv( - static_cast(sendbuf), sendsize * sizeof(value_t), dest, - static_cast(recvbuf), recvsize * sizeof(value_t), source, stream); + void device_sendrecv(const value_t* sendbuf, + size_t sendsize, + int dest, + value_t* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { + impl_->device_sendrecv(static_cast(sendbuf), + sendsize * sizeof(value_t), + dest, + static_cast(recvbuf), + recvsize * sizeof(value_t), + source, + stream); } /** @@ -485,28 +571,37 @@ class comms_t { * @param stream CUDA stream to synchronize operation */ template - void device_multicast_sendrecv( - const value_t* sendbuf, std::vector const& sendsizes, - std::vector const& sendoffsets, std::vector const& dests, - value_t* recvbuf, std::vector const& recvsizes, - std::vector const& recvoffsets, std::vector const& sources, - cudaStream_t stream) const { - auto sendbytesizes = sendsizes; + void device_multicast_sendrecv(const value_t* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + value_t* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const + { + auto sendbytesizes = sendsizes; auto sendbyteoffsets = sendoffsets; for (size_t i = 0; i < sendsizes.size(); ++i) { sendbytesizes[i] *= sizeof(value_t); sendbyteoffsets[i] *= sizeof(value_t); } - auto recvbytesizes = recvsizes; + auto recvbytesizes = recvsizes; auto recvbyteoffsets = recvoffsets; for (size_t i = 0; i < recvsizes.size(); ++i) { recvbytesizes[i] *= sizeof(value_t); recvbyteoffsets[i] *= sizeof(value_t); } impl_->device_multicast_sendrecv(static_cast(sendbuf), - sendbytesizes, sendbyteoffsets, dests, - static_cast(recvbuf), recvbytesizes, - recvbyteoffsets, sources, stream); + sendbytesizes, + sendbyteoffsets, + dests, + static_cast(recvbuf), + recvbytesizes, + recvbyteoffsets, + sources, + stream); } private: diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp index e01490d728..2be5b0d23f 100644 --- a/cpp/include/raft/comms/helper.hpp +++ b/cpp/include/raft/comms/helper.hpp @@ -36,12 +36,12 @@ namespace comms { * @param num_ranks number of ranks in communicator clique * @param rank rank of local instance */ -void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, - int num_ranks, int rank) { +void build_comms_nccl_only(handle_t* handle, ncclComm_t nccl_comm, int num_ranks, int rank) +{ cudaStream_t stream = handle->get_stream(); - auto communicator = std::make_shared(std::unique_ptr( - new raft::comms::std_comms(nccl_comm, num_ranks, rank, stream))); + auto communicator = std::make_shared( + std::unique_ptr(new raft::comms::std_comms(nccl_comm, num_ranks, rank, stream))); handle->set_comms(communicator); } @@ -60,20 +60,20 @@ void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, * @param num_ranks number of ranks in communicator clique * @param rank rank of local instance */ -void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, - void *ucp_worker, void *eps, int num_ranks, - int rank) { - auto eps_sp = std::make_shared(new ucp_ep_h[num_ranks]); +void build_comms_nccl_ucx( + handle_t* handle, ncclComm_t nccl_comm, void* ucp_worker, void* eps, int num_ranks, int rank) +{ + auto eps_sp = std::make_shared(new ucp_ep_h[num_ranks]); - auto size_t_ep_arr = reinterpret_cast(eps); + auto size_t_ep_arr = reinterpret_cast(eps); for (int i = 0; i < num_ranks; i++) { - size_t ptr = size_t_ep_arr[i]; - auto ucp_ep_v = reinterpret_cast(*eps_sp); + size_t ptr = size_t_ep_arr[i]; + auto ucp_ep_v = reinterpret_cast(*eps_sp); if (ptr != 0) { auto eps_ptr = reinterpret_cast(size_t_ep_arr[i]); - ucp_ep_v[i] = eps_ptr; + ucp_ep_v[i] = eps_ptr; } else { ucp_ep_v[i] = nullptr; } @@ -81,18 +81,19 @@ void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, cudaStream_t stream = handle->get_stream(); - auto communicator = std::make_shared( - std::unique_ptr(new raft::comms::std_comms( + auto communicator = + std::make_shared(std::unique_ptr(new raft::comms::std_comms( nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, stream))); handle->set_comms(communicator); } -inline void nccl_unique_id_from_char(ncclUniqueId *id, char *uniqueId, - int size) { +inline void nccl_unique_id_from_char(ncclUniqueId* id, char* uniqueId, int size) +{ memcpy(id->internal, uniqueId, size); } -inline void get_unique_id(char *uid, int size) { +inline void get_unique_id(char* uid, int size) +{ ncclUniqueId id; ncclGetUniqueId(&id); diff --git a/cpp/include/raft/comms/mpi_comms.hpp b/cpp/include/raft/comms/mpi_comms.hpp index 067c7bd0ab..3091cd53a9 100644 --- a/cpp/include/raft/comms/mpi_comms.hpp +++ b/cpp/include/raft/comms/mpi_comms.hpp @@ -32,16 +32,16 @@ #include #include -#define MPI_TRY(call) \ - do { \ - int status = call; \ - if (MPI_SUCCESS != status) { \ - int mpi_error_string_lenght = 0; \ - char mpi_error_string[MPI_MAX_ERROR_STRING]; \ - MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \ - RAFT_EXPECTS(MPI_SUCCESS == status, "ERROR: MPI call='%s'. Reason:%s\n", \ - #call, mpi_error_string); \ - } \ +#define MPI_TRY(call) \ + do { \ + int status = call; \ + if (MPI_SUCCESS != status) { \ + int mpi_error_string_lenght = 0; \ + char mpi_error_string[MPI_MAX_ERROR_STRING]; \ + MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \ + RAFT_EXPECTS( \ + MPI_SUCCESS == status, "ERROR: MPI call='%s'. Reason:%s\n", #call, mpi_error_string); \ + } \ } while (0) #define MPI_TRY_NO_THROW(call) \ @@ -51,48 +51,41 @@ int mpi_error_string_lenght = 0; \ char mpi_error_string[MPI_MAX_ERROR_STRING]; \ MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \ - printf("MPI call='%s' at file=%s line=%d failed with %s ", #call, \ - __FILE__, __LINE__, mpi_error_string); \ + printf("MPI call='%s' at file=%s line=%d failed with %s ", \ + #call, \ + __FILE__, \ + __LINE__, \ + mpi_error_string); \ } \ } while (0) namespace raft { namespace comms { -constexpr MPI_Datatype get_mpi_datatype(const datatype_t datatype) { +constexpr MPI_Datatype get_mpi_datatype(const datatype_t datatype) +{ switch (datatype) { - case datatype_t::CHAR: - return MPI_CHAR; - case datatype_t::UINT8: - return MPI_UNSIGNED_CHAR; - case datatype_t::INT32: - return MPI_INT; - case datatype_t::UINT32: - return MPI_UNSIGNED; - case datatype_t::INT64: - return MPI_LONG_LONG; - case datatype_t::UINT64: - return MPI_UNSIGNED_LONG_LONG; - case datatype_t::FLOAT32: - return MPI_FLOAT; - case datatype_t::FLOAT64: - return MPI_DOUBLE; + case datatype_t::CHAR: return MPI_CHAR; + case datatype_t::UINT8: return MPI_UNSIGNED_CHAR; + case datatype_t::INT32: return MPI_INT; + case datatype_t::UINT32: return MPI_UNSIGNED; + case datatype_t::INT64: return MPI_LONG_LONG; + case datatype_t::UINT64: return MPI_UNSIGNED_LONG_LONG; + case datatype_t::FLOAT32: return MPI_FLOAT; + case datatype_t::FLOAT64: return MPI_DOUBLE; default: // Execution should never reach here. This takes care of compiler warning. return MPI_DOUBLE; } } -constexpr MPI_Op get_mpi_op(const op_t op) { +constexpr MPI_Op get_mpi_op(const op_t op) +{ switch (op) { - case op_t::SUM: - return MPI_SUM; - case op_t::PROD: - return MPI_PROD; - case op_t::MIN: - return MPI_MIN; - case op_t::MAX: - return MPI_MAX; + case op_t::SUM: return MPI_SUM; + case op_t::PROD: return MPI_PROD; + case op_t::MIN: return MPI_MIN; + case op_t::MAX: return MPI_MAX; default: // Execution should never reach here. This takes care of compiler warning. return MPI_MAX; @@ -102,38 +95,35 @@ constexpr MPI_Op get_mpi_op(const op_t op) { class mpi_comms : public comms_iface { public: mpi_comms(MPI_Comm comm, const bool owns_mpi_comm) - : owns_mpi_comm_(owns_mpi_comm), - mpi_comm_(comm), - size_(0), - rank_(1), - next_request_id_(0) { + : owns_mpi_comm_(owns_mpi_comm), mpi_comm_(comm), size_(0), rank_(1), next_request_id_(0) + { int mpi_is_initialized = 0; MPI_TRY(MPI_Initialized(&mpi_is_initialized)); RAFT_EXPECTS(mpi_is_initialized, "ERROR: MPI is not initialized!"); MPI_TRY(MPI_Comm_size(mpi_comm_, &size_)); MPI_TRY(MPI_Comm_rank(mpi_comm_, &rank_)); - //get NCCL unique ID at rank 0 and broadcast it to all others + // get NCCL unique ID at rank 0 and broadcast it to all others ncclUniqueId id; if (0 == rank_) NCCL_TRY(ncclGetUniqueId(&id)); MPI_TRY(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, mpi_comm_)); - //initializing NCCL + // initializing NCCL NCCL_TRY(ncclCommInitRank(&nccl_comm_, size_, id, rank_)); } - virtual ~mpi_comms() { - //finalizing NCCL + virtual ~mpi_comms() + { + // finalizing NCCL NCCL_TRY_NO_THROW(ncclCommDestroy(nccl_comm_)); - if (owns_mpi_comm_) { - MPI_TRY_NO_THROW(MPI_Comm_free(&mpi_comm_)); - } + if (owns_mpi_comm_) { MPI_TRY_NO_THROW(MPI_Comm_free(&mpi_comm_)); } } int get_size() const { return size_; } int get_rank() const { return rank_; } - std::unique_ptr comm_split(int color, int key) const { + std::unique_ptr comm_split(int color, int key) const + { MPI_Comm new_comm; MPI_TRY(MPI_Comm_split(mpi_comm_, color, key, &new_comm)); return std::unique_ptr(new mpi_comms(new_comm, true)); @@ -141,15 +131,15 @@ class mpi_comms : public comms_iface { void barrier() const { MPI_TRY(MPI_Barrier(mpi_comm_)); } - void isend(const void* buf, size_t size, int dest, int tag, - request_t* request) const { + void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const + { MPI_Request mpi_req; request_t req_id; if (free_requests_.empty()) { req_id = next_request_id_++; } else { auto it = free_requests_.begin(); - req_id = *it; + req_id = *it; free_requests_.erase(it); } MPI_TRY(MPI_Isend(buf, size, MPI_BYTE, dest, tag, mpi_comm_, &mpi_req)); @@ -157,15 +147,15 @@ class mpi_comms : public comms_iface { *request = req_id; } - void irecv(void* buf, size_t size, int source, int tag, - request_t* request) const { + void irecv(void* buf, size_t size, int source, int tag, request_t* request) const + { MPI_Request mpi_req; request_t req_id; if (free_requests_.empty()) { req_id = next_request_id_++; } else { auto it = free_requests_.begin(); - req_id = *it; + req_id = *it; free_requests_.erase(it); } @@ -174,7 +164,8 @@ class mpi_comms : public comms_iface { *request = req_id; } - void waitall(int count, request_t array_of_requests[]) const { + void waitall(int count, request_t array_of_requests[]) const + { std::vector requests; requests.reserve(count); for (int i = 0; i < count; ++i) { @@ -189,94 +180,149 @@ class mpi_comms : public comms_iface { MPI_TRY(MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE)); } - void allreduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count, - get_nccl_datatype(datatype), get_nccl_op(op), - nccl_comm_, stream)); + void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + cudaStream_t stream) const + { + NCCL_TRY(ncclAllReduce( + sendbuff, recvbuff, count, get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream)); } - void bcast(void* buff, size_t count, datatype_t datatype, int root, - cudaStream_t stream) const { - NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const + { + NCCL_TRY( + ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); } - void bcast(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, int root, cudaStream_t stream) const { - NCCL_TRY(ncclBroadcast(sendbuff, recvbuff, count, - get_nccl_datatype(datatype), root, nccl_comm_, - stream)); + void bcast(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + int root, + cudaStream_t stream) const + { + NCCL_TRY(ncclBroadcast( + sendbuff, recvbuff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); } - void reduce(const void* sendbuff, void* recvbuff, size_t count, - datatype_t datatype, op_t op, int root, - cudaStream_t stream) const { - NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype), - get_nccl_op(op), root, nccl_comm_, stream)); + void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, + cudaStream_t stream) const + { + NCCL_TRY(ncclReduce(sendbuff, + recvbuff, + count, + get_nccl_datatype(datatype), + get_nccl_op(op), + root, + nccl_comm_, + stream)); } - void allgather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, cudaStream_t stream) const { - NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount, - get_nccl_datatype(datatype), nccl_comm_, stream)); + void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const + { + NCCL_TRY(ncclAllGather( + sendbuff, recvbuff, sendcount, get_nccl_datatype(datatype), nccl_comm_, stream)); } - void allgatherv(const void* sendbuf, void* recvbuf, const size_t* recvcounts, - const size_t* displs, datatype_t datatype, - cudaStream_t stream) const { - //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf - //Listing 1 on page 4. + void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const + { + // From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - + // https://arxiv.org/pdf/1812.05964.pdf Listing 1 on page 4. for (int root = 0; root < size_; ++root) { - NCCL_TRY(ncclBroadcast(sendbuf, - static_cast(recvbuf) + - displs[root] * get_datatype_size(datatype), - recvcounts[root], get_nccl_datatype(datatype), - root, nccl_comm_, stream)); + NCCL_TRY( + ncclBroadcast(sendbuf, + static_cast(recvbuf) + displs[root] * get_datatype_size(datatype), + recvcounts[root], + get_nccl_datatype(datatype), + root, + nccl_comm_, + stream)); } } - void gather(const void* sendbuff, void* recvbuff, size_t sendcount, - datatype_t datatype, int root, cudaStream_t stream) const { + void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, + cudaStream_t stream) const + { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv( - static_cast(recvbuff) + sendcount * r * dtype_size, sendcount, - get_nccl_datatype(datatype), r, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(static_cast(recvbuff) + sendcount * r * dtype_size, + sendcount, + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void gatherv(const void* sendbuff, void* recvbuff, size_t sendcount, - const size_t* recvcounts, const size_t* displs, - datatype_t datatype, int root, cudaStream_t stream) const { + void gatherv(const void* sendbuff, + void* recvbuff, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, + cudaStream_t stream) const + { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { NCCL_TRY(ncclRecv(static_cast(recvbuff) + displs[r] * dtype_size, - recvcounts[r], get_nccl_datatype(datatype), r, - nccl_comm_, stream)); + recvcounts[r], + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void reducescatter(const void* sendbuff, void* recvbuff, size_t recvcount, - datatype_t datatype, op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount, - get_nccl_datatype(datatype), get_nccl_op(op), - nccl_comm_, stream)); + void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, + cudaStream_t stream) const + { + NCCL_TRY(ncclReduceScatter(sendbuff, + recvbuff, + recvcount, + get_nccl_datatype(datatype), + get_nccl_op(op), + nccl_comm_, + stream)); } - status_t sync_stream(cudaStream_t stream) const { + status_t sync_stream(cudaStream_t stream) const + { cudaError_t cudaErr; ncclResult_t ncclErr, ncclAsyncErr; while (1) { @@ -309,45 +355,58 @@ class mpi_comms : public comms_iface { }; // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_send(const void* buf, size_t size, int dest, - cudaStream_t stream) const { + void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const + { NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream)); } // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_recv(void* buf, size_t size, int source, - cudaStream_t stream) const { + void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const + { NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream)); } - void device_sendrecv(const void* sendbuf, size_t sendsize, int dest, - void* recvbuf, size_t recvsize, int source, - cudaStream_t stream) const { + void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream)); - NCCL_TRY( - ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } void device_multicast_sendrecv(const void* sendbuf, std::vector const& sendsizes, std::vector const& sendoffsets, - std::vector const& dests, void* recvbuf, + std::vector const& dests, + void* recvbuf, std::vector const& recvsizes, std::vector const& recvoffsets, std::vector const& sources, - cudaStream_t stream) const { + cudaStream_t stream) const + { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); for (size_t i = 0; i < sendsizes.size(); ++i) { NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], - sendsizes[i], ncclUint8, dests[i], nccl_comm_, stream)); + sendsizes[i], + ncclUint8, + dests[i], + nccl_comm_, + stream)); } for (size_t i = 0; i < recvsizes.size(); ++i) { NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], - recvsizes[i], ncclUint8, sources[i], nccl_comm_, + recvsizes[i], + ncclUint8, + sources[i], + nccl_comm_, stream)); } NCCL_TRY(ncclGroupEnd()); @@ -365,9 +424,10 @@ class mpi_comms : public comms_iface { mutable std::unordered_set free_requests_; }; -inline void initialize_mpi_comms(handle_t* handle, MPI_Comm comm) { - auto communicator = std::make_shared( - std::unique_ptr(new mpi_comms(comm, true))); +inline void initialize_mpi_comms(handle_t* handle, MPI_Comm comm) +{ + auto communicator = + std::make_shared(std::unique_ptr(new mpi_comms(comm, true))); handle->set_comms(communicator); }; diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 47559b1718..1647c29667 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -64,9 +64,13 @@ class std_comms : public comms_iface { * @param stream cuda stream for synchronizing and ordering collective operations * @param subcomms_ucp use ucp for subcommunicators */ - std_comms(ncclComm_t nccl_comm, ucp_worker_h ucp_worker, - std::shared_ptr eps, int num_ranks, int rank, - cudaStream_t stream, bool subcomms_ucp = true) + std_comms(ncclComm_t nccl_comm, + ucp_worker_h ucp_worker, + std::shared_ptr eps, + int num_ranks, + int rank, + cudaStream_t stream, + bool subcomms_ucp = true) : nccl_comm_(nccl_comm), stream_(stream), status_(2, stream), @@ -75,7 +79,8 @@ class std_comms : public comms_iface { subcomms_ucp_(subcomms_ucp), ucp_worker_(ucp_worker), ucp_eps_(eps), - next_request_id_(0) { + next_request_id_(0) + { initialize(); }; @@ -86,18 +91,19 @@ class std_comms : public comms_iface { * @param rank rank of the current worker * @param stream stream for ordering collective operations */ - std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank, - cudaStream_t stream) + std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank, cudaStream_t stream) : nccl_comm_(nccl_comm), stream_(stream), status_(2, stream), num_ranks_(num_ranks), rank_(rank), - subcomms_ucp_(false) { + subcomms_ucp_(false) + { initialize(); }; - void initialize() { + void initialize() + { sendbuff_ = status_.data(); recvbuff_ = status_.data() + 1; } @@ -106,17 +112,16 @@ class std_comms : public comms_iface { int get_rank() const { return rank_; } - std::unique_ptr comm_split(int color, int key) const { + std::unique_ptr comm_split(int color, int key) const + { rmm::device_uvector d_colors(get_size(), stream_); rmm::device_uvector d_keys(get_size(), stream_); update_device(d_colors.data() + get_rank(), &color, 1, stream_); update_device(d_keys.data() + get_rank(), &key, 1, stream_); - allgather(d_colors.data() + get_rank(), d_colors.data(), 1, - datatype_t::INT32, stream_); - allgather(d_keys.data() + get_rank(), d_keys.data(), 1, datatype_t::INT32, - stream_); + allgather(d_colors.data() + get_rank(), d_colors.data(), 1, datatype_t::INT32, stream_); + allgather(d_keys.data() + get_rank(), d_keys.data(), 1, datatype_t::INT32, stream_); this->sync_stream(stream_); std::vector h_colors(get_size()); @@ -133,9 +138,7 @@ class std_comms : public comms_iface { for (int i = 0; i < get_size(); ++i) { if (h_colors[i] == color) { subcomm_ranks.push_back(i); - if (ucp_worker_ != nullptr && subcomms_ucp_) { - new_ucx_ptrs.push_back((*ucp_eps_)[i]); - } + if (ucp_worker_ != nullptr && subcomms_ucp_) { new_ucx_ptrs.push_back((*ucp_eps_)[i]); } } } @@ -144,8 +147,7 @@ class std_comms : public comms_iface { NCCL_TRY(ncclGetUniqueId(&id)); std::vector requests(subcomm_ranks.size() - 1); for (size_t i = 1; i < subcomm_ranks.size(); ++i) { - isend(&id, sizeof(ncclUniqueId), subcomm_ranks[i], color, - requests.data() + (i - 1)); + isend(&id, sizeof(ncclUniqueId), subcomm_ranks[i], color, requests.data() + (i - 1)); } waitall(requests.size(), requests.data()); } else { @@ -160,17 +162,22 @@ class std_comms : public comms_iface { NCCL_TRY(ncclCommInitRank(&nccl_comm, subcomm_ranks.size(), id, key)); if (ucp_worker_ != nullptr && subcomms_ucp_) { - auto eps_sp = std::make_shared(new_ucx_ptrs.data()); - return std::unique_ptr( - new std_comms(nccl_comm, (ucp_worker_h)ucp_worker_, eps_sp, - subcomm_ranks.size(), key, stream_, subcomms_ucp_)); + auto eps_sp = std::make_shared(new_ucx_ptrs.data()); + return std::unique_ptr(new std_comms(nccl_comm, + (ucp_worker_h)ucp_worker_, + eps_sp, + subcomm_ranks.size(), + key, + stream_, + subcomms_ucp_)); } else { return std::unique_ptr( new std_comms(nccl_comm, subcomm_ranks.size(), key, stream_)); } } - void barrier() const { + void barrier() const + { CUDA_CHECK(cudaMemsetAsync(sendbuff_, 1, sizeof(int), stream_)); CUDA_CHECK(cudaMemsetAsync(recvbuff_, 1, sizeof(int), stream_)); @@ -180,39 +187,37 @@ class std_comms : public comms_iface { "ERROR: syncStream failed. This can be caused by a failed rank_."); } - void get_request_id(request_t *req) const { + void get_request_id(request_t* req) const + { request_t req_id; if (this->free_requests_.empty()) req_id = this->next_request_id_++; else { auto it = this->free_requests_.begin(); - req_id = *it; + req_id = *it; this->free_requests_.erase(it); } *req = req_id; } - void isend(const void *buf, size_t size, int dest, int tag, - request_t *request) const { - ASSERT(ucp_worker_ != nullptr, - "ERROR: UCX comms not initialized on communicator."); + void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const + { + ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); get_request_id(request); ucp_ep_h ep_ptr = (*ucp_eps_)[dest]; - ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); + ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request)); - this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, - default_tag_mask, get_rank()); + this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, default_tag_mask, get_rank()); requests_in_flight_.insert(std::make_pair(*request, ucp_req)); } - void irecv(void *buf, size_t size, int source, int tag, - request_t *request) const { - ASSERT(ucp_worker_ != nullptr, - "ERROR: UCX comms not initialized on communicator."); + void irecv(void* buf, size_t size, int source, int tag, request_t* request) const + { + ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); get_request_id(request); @@ -220,18 +225,17 @@ class std_comms : public comms_iface { ucp_tag_t tag_mask = default_tag_mask; - ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request)); - ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag, - tag_mask, source); + ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request)); + ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag, tag_mask, source); requests_in_flight_.insert(std::make_pair(*request, ucp_req)); } - void waitall(int count, request_t array_of_requests[]) const { - ASSERT(ucp_worker_ != nullptr, - "ERROR: UCX comms not initialized on communicator."); + void waitall(int count, request_t array_of_requests[]) const + { + ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator."); - std::vector requests; + std::vector requests; requests.reserve(count); time_t start = time(NULL); @@ -239,7 +243,8 @@ class std_comms : public comms_iface { for (int i = 0; i < count; ++i) { auto req_it = requests_in_flight_.find(array_of_requests[i]); ASSERT(requests_in_flight_.end() != req_it, - "ERROR: waitall on invalid request: %d", array_of_requests[i]); + "ERROR: waitall on invalid request: %d", + array_of_requests[i]); requests.push_back(req_it->second); free_requests_.insert(req_it->first); requests_in_flight_.erase(req_it); @@ -252,8 +257,7 @@ class std_comms : public comms_iface { // in 10 or more seconds. ASSERT(now - start < 10, "Timed out waiting for requests."); - for (std::vector::iterator it = requests.begin(); - it != requests.end();) { + for (std::vector::iterator it = requests.begin(); it != requests.end();) { bool restart = false; // resets the timeout when any progress was made // Causes UCP to progress through the send/recv message queue @@ -266,10 +270,8 @@ class std_comms : public comms_iface { // If the message needs release, we know it will be sent/received // asynchronously, so we will need to track and verify its state if (req->needs_release) { - ASSERT(UCS_PTR_IS_PTR(req->req), - "UCX Request Error. Request is not valid UCX pointer"); - ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", - UCS_PTR_STATUS(req->req)); + ASSERT(UCS_PTR_IS_PTR(req->req), "UCX Request Error. Request is not valid UCX pointer"); + ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", UCS_PTR_STATUS(req->req)); ASSERT(req->req->completed == 1 || req->req->completed == 0, "request->completed not a valid value: %d\n", req->req->completed); @@ -290,101 +292,154 @@ class std_comms : public comms_iface { ++it; } // if any progress was made, reset the timeout start time - if (restart) { - start = time(NULL); - } + if (restart) { start = time(NULL); } } } } - void allreduce(const void *sendbuff, void *recvbuff, size_t count, - datatype_t datatype, op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count, - get_nccl_datatype(datatype), get_nccl_op(op), - nccl_comm_, stream)); + void allreduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + cudaStream_t stream) const + { + NCCL_TRY(ncclAllReduce( + sendbuff, recvbuff, count, get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream)); } - void bcast(void *buff, size_t count, datatype_t datatype, int root, - cudaStream_t stream) const { - NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const + { + NCCL_TRY( + ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); } - void bcast(const void *sendbuff, void *recvbuff, size_t count, - datatype_t datatype, int root, cudaStream_t stream) const { - NCCL_TRY(ncclBroadcast(sendbuff, recvbuff, count, - get_nccl_datatype(datatype), root, nccl_comm_, - stream)); + void bcast(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + int root, + cudaStream_t stream) const + { + NCCL_TRY(ncclBroadcast( + sendbuff, recvbuff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream)); } - void reduce(const void *sendbuff, void *recvbuff, size_t count, - datatype_t datatype, op_t op, int root, - cudaStream_t stream) const { - NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype), - get_nccl_op(op), root, nccl_comm_, stream)); + void reduce(const void* sendbuff, + void* recvbuff, + size_t count, + datatype_t datatype, + op_t op, + int root, + cudaStream_t stream) const + { + NCCL_TRY(ncclReduce(sendbuff, + recvbuff, + count, + get_nccl_datatype(datatype), + get_nccl_op(op), + root, + nccl_comm_, + stream)); } - void allgather(const void *sendbuff, void *recvbuff, size_t sendcount, - datatype_t datatype, cudaStream_t stream) const { - NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount, - get_nccl_datatype(datatype), nccl_comm_, stream)); + void allgather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + cudaStream_t stream) const + { + NCCL_TRY(ncclAllGather( + sendbuff, recvbuff, sendcount, get_nccl_datatype(datatype), nccl_comm_, stream)); } - void allgatherv(const void *sendbuf, void *recvbuf, const size_t *recvcounts, - const size_t *displs, datatype_t datatype, - cudaStream_t stream) const { - //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf - //Listing 1 on page 4. + void allgatherv(const void* sendbuf, + void* recvbuf, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + cudaStream_t stream) const + { + // From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - + // https://arxiv.org/pdf/1812.05964.pdf Listing 1 on page 4. for (int root = 0; root < num_ranks_; ++root) { size_t dtype_size = get_datatype_size(datatype); - NCCL_TRY(ncclBroadcast( - sendbuf, static_cast(recvbuf) + displs[root] * dtype_size, - recvcounts[root], get_nccl_datatype(datatype), root, nccl_comm_, - stream)); + NCCL_TRY(ncclBroadcast(sendbuf, + static_cast(recvbuf) + displs[root] * dtype_size, + recvcounts[root], + get_nccl_datatype(datatype), + root, + nccl_comm_, + stream)); } } - void gather(const void *sendbuff, void *recvbuff, size_t sendcount, - datatype_t datatype, int root, cudaStream_t stream) const { + void gather(const void* sendbuff, + void* recvbuff, + size_t sendcount, + datatype_t datatype, + int root, + cudaStream_t stream) const + { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv( - static_cast(recvbuff) + sendcount * r * dtype_size, sendcount, - get_nccl_datatype(datatype), r, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(static_cast(recvbuff) + sendcount * r * dtype_size, + sendcount, + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void gatherv(const void *sendbuff, void *recvbuff, size_t sendcount, - const size_t *recvcounts, const size_t *displs, - datatype_t datatype, int root, cudaStream_t stream) const { + void gatherv(const void* sendbuff, + void* recvbuff, + size_t sendcount, + const size_t* recvcounts, + const size_t* displs, + datatype_t datatype, + int root, + cudaStream_t stream) const + { size_t dtype_size = get_datatype_size(datatype); NCCL_TRY(ncclGroupStart()); if (get_rank() == root) { for (int r = 0; r < get_size(); ++r) { - NCCL_TRY(ncclRecv( - static_cast(recvbuff) + displs[r] * dtype_size, recvcounts[r], - get_nccl_datatype(datatype), r, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(static_cast(recvbuff) + displs[r] * dtype_size, + recvcounts[r], + get_nccl_datatype(datatype), + r, + nccl_comm_, + stream)); } } - NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, - nccl_comm_, stream)); + NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void reducescatter(const void *sendbuff, void *recvbuff, size_t recvcount, - datatype_t datatype, op_t op, cudaStream_t stream) const { - NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount, - get_nccl_datatype(datatype), get_nccl_op(op), - nccl_comm_, stream)); + void reducescatter(const void* sendbuff, + void* recvbuff, + size_t recvcount, + datatype_t datatype, + op_t op, + cudaStream_t stream) const + { + NCCL_TRY(ncclReduceScatter(sendbuff, + recvbuff, + recvcount, + get_nccl_datatype(datatype), + get_nccl_op(op), + nccl_comm_, + stream)); } - status_t sync_stream(cudaStream_t stream) const { + status_t sync_stream(cudaStream_t stream) const + { cudaError_t cudaErr; ncclResult_t ncclErr, ncclAsyncErr; while (1) { @@ -417,45 +472,58 @@ class std_comms : public comms_iface { } // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_send(const void *buf, size_t size, int dest, - cudaStream_t stream) const { + void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const + { NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream)); } // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock - void device_recv(void *buf, size_t size, int source, - cudaStream_t stream) const { + void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const + { NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream)); } - void device_sendrecv(const void *sendbuf, size_t sendsize, int dest, - void *recvbuf, size_t recvsize, int source, - cudaStream_t stream) const { + void device_sendrecv(const void* sendbuf, + size_t sendsize, + int dest, + void* recvbuf, + size_t recvsize, + int source, + cudaStream_t stream) const + { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream)); - NCCL_TRY( - ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); + NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream)); NCCL_TRY(ncclGroupEnd()); } - void device_multicast_sendrecv(const void *sendbuf, - std::vector const &sendsizes, - std::vector const &sendoffsets, - std::vector const &dests, void *recvbuf, - std::vector const &recvsizes, - std::vector const &recvoffsets, - std::vector const &sources, - cudaStream_t stream) const { + void device_multicast_sendrecv(const void* sendbuf, + std::vector const& sendsizes, + std::vector const& sendoffsets, + std::vector const& dests, + void* recvbuf, + std::vector const& recvsizes, + std::vector const& recvoffsets, + std::vector const& sources, + cudaStream_t stream) const + { // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock NCCL_TRY(ncclGroupStart()); for (size_t i = 0; i < sendsizes.size(); ++i) { - NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], - sendsizes[i], ncclUint8, dests[i], nccl_comm_, stream)); + NCCL_TRY(ncclSend(static_cast(sendbuf) + sendoffsets[i], + sendsizes[i], + ncclUint8, + dests[i], + nccl_comm_, + stream)); } for (size_t i = 0; i < recvsizes.size(); ++i) { - NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], - recvsizes[i], ncclUint8, sources[i], nccl_comm_, + NCCL_TRY(ncclRecv(static_cast(recvbuf) + recvoffsets[i], + recvsizes[i], + ncclUint8, + sources[i], + nccl_comm_, stream)); } NCCL_TRY(ncclGroupEnd()); @@ -475,10 +543,9 @@ class std_comms : public comms_iface { comms_ucp_handler ucp_handler_; ucp_worker_h ucp_worker_; - std::shared_ptr ucp_eps_; + std::shared_ptr ucp_eps_; mutable request_t next_request_id_; - mutable std::unordered_map - requests_in_flight_; + mutable std::unordered_map requests_in_flight_; mutable std::unordered_set free_requests_; }; } // end namespace comms diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp index 39086de25d..5f87bf41fa 100644 --- a/cpp/include/raft/comms/test.hpp +++ b/cpp/include/raft/comms/test.hpp @@ -35,24 +35,23 @@ namespace comms { * * @param[in] handle the raft handle to use. This is expected to already have an * initialized comms instance. -* @param[in] root the root rank id + * @param[in] root the root rank id */ -bool test_collective_allreduce(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_allreduce(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = 1; cudaStream_t stream = handle.get_stream(); rmm::device_scalar temp_d(stream); - CUDA_CHECK( - cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); communicator.allreduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, stream); int temp_h = 0; - CUDA_CHECK( - cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -67,10 +66,11 @@ bool test_collective_allreduce(const handle_t &handle, int root) { * * @param[in] handle the raft handle to use. This is expected to already have an * initialized comms instance. -* @param[in] root the root rank id + * @param[in] root the root rank id */ -bool test_collective_broadcast(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_broadcast(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = root; @@ -79,14 +79,12 @@ bool test_collective_broadcast(const handle_t &handle, int root) { rmm::device_scalar temp_d(stream); if (communicator.get_rank() == root) - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.bcast(temp_d.data(), 1, root, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), - cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -101,10 +99,11 @@ bool test_collective_broadcast(const handle_t &handle, int root) { * * @param[in] handle the raft handle to use. This is expected to already have an * initialized comms instance. -* @param[in] root the root rank id + * @param[in] root the root rank id */ -bool test_collective_reduce(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_reduce(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = root; @@ -112,14 +111,12 @@ bool test_collective_reduce(const handle_t &handle, int root) { rmm::device_scalar temp_d(stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.reduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, root, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), - cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -137,10 +134,11 @@ bool test_collective_reduce(const handle_t &handle, int root) { * * @param[in] handle the raft handle to use. This is expected to already have an * initialized comms instance. -* @param[in] root the root rank id + * @param[in] root the root rank id */ -bool test_collective_allgather(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_allgather(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = communicator.get_rank(); @@ -149,16 +147,13 @@ bool test_collective_allgather(const handle_t &handle, int root) { rmm::device_scalar temp_d(stream); rmm::device_uvector recv_d(communicator.get_size(), stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.allgather(temp_d.data(), recv_d.data(), 1, stream); communicator.sync_stream(stream); - int - temp_h[communicator.get_size()]; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), - sizeof(int) * communicator.get_size(), - cudaMemcpyDeviceToHost, stream)); + int temp_h[communicator.get_size()]; // Verify more than one byte is being sent + CUDA_CHECK(cudaMemcpyAsync( + &temp_h, recv_d.data(), sizeof(int) * communicator.get_size(), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -176,30 +171,29 @@ bool test_collective_allgather(const handle_t &handle, int root) { * * @param[in] handle the raft handle to use. This is expected to already have an * initialized comms instance. -* @param[in] root the root rank id + * @param[in] root the root rank id */ -bool test_collective_gather(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_gather(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); int const send = communicator.get_rank(); cudaStream_t stream = handle.get_stream(); rmm::device_scalar temp_d(stream); - rmm::device_uvector recv_d( - communicator.get_rank() == root ? communicator.get_size() : 0, stream); + rmm::device_uvector recv_d(communicator.get_rank() == root ? communicator.get_size() : 0, + stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), - cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.gather(temp_d.data(), recv_d.data(), 1, root, stream); communicator.sync_stream(stream); if (communicator.get_rank() == root) { std::vector temp_h(communicator.get_size(), 0); - CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(), - sizeof(int) * temp_h.size(), - cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync( + temp_h.data(), recv_d.data(), sizeof(int) * temp_h.size(), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); for (int i = 0; i < communicator.get_size(); i++) { @@ -214,45 +208,47 @@ bool test_collective_gather(const handle_t &handle, int root) { * * @param[in] handle the raft handle to use. This is expected to already have an * initialized comms instance. -* @param[in] root the root rank id + * @param[in] root the root rank id */ -bool test_collective_gatherv(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_gatherv(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); std::vector sendcounts(communicator.get_size()); std::iota(sendcounts.begin(), sendcounts.end(), size_t{1}); std::vector displacements(communicator.get_size() + 1, 0); - std::partial_sum(sendcounts.begin(), sendcounts.end(), - displacements.begin() + 1); + std::partial_sum(sendcounts.begin(), sendcounts.end(), displacements.begin() + 1); - std::vector sends(displacements[communicator.get_rank() + 1] - - displacements[communicator.get_rank()], - communicator.get_rank()); + std::vector sends( + displacements[communicator.get_rank() + 1] - displacements[communicator.get_rank()], + communicator.get_rank()); cudaStream_t stream = handle.get_stream(); rmm::device_uvector temp_d(sends.size(), stream); - rmm::device_uvector recv_d( - communicator.get_rank() == root ? displacements.back() : 0, stream); + rmm::device_uvector recv_d(communicator.get_rank() == root ? displacements.back() : 0, + stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), - sends.size() * sizeof(int), cudaMemcpyHostToDevice, - stream)); + CUDA_CHECK(cudaMemcpyAsync( + temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream)); communicator.gatherv( - temp_d.data(), recv_d.data(), temp_d.size(), - communicator.get_rank() == root ? sendcounts.data() - : static_cast(nullptr), - communicator.get_rank() == root ? displacements.data() - : static_cast(nullptr), - root, stream); + temp_d.data(), + recv_d.data(), + temp_d.size(), + communicator.get_rank() == root ? sendcounts.data() : static_cast(nullptr), + communicator.get_rank() == root ? displacements.data() : static_cast(nullptr), + root, + stream); communicator.sync_stream(stream); if (communicator.get_rank() == root) { std::vector temp_h(displacements.back(), 0); - CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(), + CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), + recv_d.data(), sizeof(int) * displacements.back(), - cudaMemcpyDeviceToHost, stream)); + cudaMemcpyDeviceToHost, + stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); for (int i = 0; i < communicator.get_size(); i++) { @@ -271,10 +267,11 @@ bool test_collective_gatherv(const handle_t &handle, int root) { * * @param[in] handle the raft handle to use. This is expected to already have an * initialized comms instance. -* @param[in] root the root rank id + * @param[in] root the root rank id */ -bool test_collective_reducescatter(const handle_t &handle, int root) { - comms_t const &communicator = handle.get_comms(); +bool test_collective_reducescatter(const handle_t& handle, int root) +{ + comms_t const& communicator = handle.get_comms(); std::vector sends(communicator.get_size(), 1); @@ -283,16 +280,13 @@ bool test_collective_reducescatter(const handle_t &handle, int root) { rmm::device_uvector temp_d(sends.size(), stream); rmm::device_scalar recv_d(stream); - CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), - sends.size() * sizeof(int), cudaMemcpyHostToDevice, - stream)); + CUDA_CHECK(cudaMemcpyAsync( + temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream)); - communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, - stream); + communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, stream); communicator.sync_stream(stream); int temp_h = -1; // Verify more than one byte is being sent - CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int), - cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); communicator.barrier(); @@ -309,9 +303,10 @@ bool test_collective_reducescatter(const handle_t &handle, int root) { * initialized comms instance. * @param[in] numTrials number of iterations of all-to-all messaging to perform */ -bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); +bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -320,11 +315,11 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { std::vector requests; requests.resize(2 * (communicator.get_size() - 1)); int request_idx = 0; - //post receives + // post receives for (int r = 0; r < communicator.get_size(); ++r) { if (r != rank) { - communicator.irecv(received_data.data() + request_idx, 1, r, 0, - requests.data() + request_idx); + communicator.irecv( + received_data.data() + request_idx, 1, r, 0, requests.data() + request_idx); ++request_idx; } } @@ -360,8 +355,7 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { communicator.barrier(); } - if (communicator.get_rank() == 0) - std::cout << "=========================" << std::endl; + if (communicator.get_rank() == 0) std::cout << "=========================" << std::endl; } return ret; @@ -374,10 +368,11 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) { * initialized comms instance. * @param numTrials number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); - cudaStream_t stream = h.get_stream(); +bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); + cudaStream_t stream = h.get_stream(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -400,13 +395,9 @@ bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) { communicator.sync_stream(stream); - if (!sender && received_data.value(stream) != rank - 1) { - ret = false; - } + if (!sender && received_data.value(stream) != rank - 1) { ret = false; } - if (communicator.get_rank() == 0) { - std::cout << "=========================" << std::endl; - } + if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; } } return ret; @@ -419,10 +410,11 @@ bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) { * initialized comms instance. * @param numTrials number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); - cudaStream_t stream = h.get_stream(); +bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); + cudaStream_t stream = h.get_stream(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -436,12 +428,12 @@ bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) { if (rank % 2 == 0) { if (rank + 1 < communicator.get_size()) { - communicator.device_sendrecv(sent_data.data(), 1, rank + 1, - received_data.data(), 1, rank + 1, stream); + communicator.device_sendrecv( + sent_data.data(), 1, rank + 1, received_data.data(), 1, rank + 1, stream); } } else { - communicator.device_sendrecv(sent_data.data(), 1, rank - 1, - received_data.data(), 1, rank - 1, stream); + communicator.device_sendrecv( + sent_data.data(), 1, rank - 1, received_data.data(), 1, rank - 1, stream); } communicator.sync_stream(stream); @@ -451,9 +443,7 @@ bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) { ret = false; } - if (communicator.get_rank() == 0) { - std::cout << "=========================" << std::endl; - } + if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; } } return ret; @@ -466,11 +456,11 @@ bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) { * initialized comms instance. * @param numTrials number of iterations of send or receive messaging to perform */ -bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h, - int numTrials) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); - cudaStream_t stream = h.get_stream(); +bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrials) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); + cudaStream_t stream = h.get_stream(); bool ret = true; for (int i = 0; i < numTrials; i++) { @@ -493,25 +483,26 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h, std::vector srcs(communicator.get_size()); std::iota(srcs.begin(), srcs.end(), int{0}); - communicator.device_multicast_sendrecv( - sent_data.data(), sendsizes, sendoffsets, dests, received_data.data(), - recvsizes, recvoffsets, srcs, stream); + communicator.device_multicast_sendrecv(sent_data.data(), + sendsizes, + sendoffsets, + dests, + received_data.data(), + recvsizes, + recvoffsets, + srcs, + stream); communicator.sync_stream(stream); std::vector h_received_data(communicator.get_size()); - raft::update_host(h_received_data.data(), received_data.data(), - received_data.size(), stream); + raft::update_host(h_received_data.data(), received_data.data(), received_data.size(), stream); CUDA_TRY(cudaStreamSynchronize(stream)); for (int i = 0; i < communicator.get_size(); ++i) { - if (h_received_data[i] != i) { - ret = false; - } + if (h_received_data[i] != i) { ret = false; } } - if (communicator.get_rank() == 0) { - std::cout << "=========================" << std::endl; - } + if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; } } return ret; @@ -524,20 +515,20 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h, * initialized comms instance. * @param n_colors number of different colors to test */ -bool test_commsplit(const handle_t &h, int n_colors) { - comms_t const &communicator = h.get_comms(); - int const rank = communicator.get_rank(); - int const size = communicator.get_size(); +bool test_commsplit(const handle_t& h, int n_colors) +{ + comms_t const& communicator = h.get_comms(); + int const rank = communicator.get_rank(); + int const size = communicator.get_size(); if (n_colors > size) n_colors = size; // first we need to assign to a color, then assign the rank within the color int color = rank % n_colors; - int key = rank / n_colors; + int key = rank / n_colors; handle_t new_handle(1); - auto shared_comm = - std::make_shared(communicator.comm_split(color, key)); + auto shared_comm = std::make_shared(communicator.comm_split(color, key)); new_handle.set_comms(shared_comm); return test_collective_allreduce(new_handle, 0); diff --git a/cpp/include/raft/comms/ucp_helper.hpp b/cpp/include/raft/comms/ucp_helper.hpp index 226b6f0527..89c7b25630 100644 --- a/cpp/include/raft/comms/ucp_helper.hpp +++ b/cpp/include/raft/comms/ucp_helper.hpp @@ -25,16 +25,19 @@ namespace raft { namespace comms { -typedef void (*dlsym_print_info)(ucp_ep_h, FILE *); -typedef void (*dlsym_rec_free)(void *); +typedef void (*dlsym_print_info)(ucp_ep_h, FILE*); +typedef void (*dlsym_rec_free)(void*); typedef int (*dlsym_worker_progress)(ucp_worker_h); -typedef ucs_status_ptr_t (*dlsym_send)(ucp_ep_h, const void *, size_t, - ucp_datatype_t, ucp_tag_t, - ucp_send_callback_t); -typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h, void *, size_t count, - ucp_datatype_t datatype, ucp_tag_t, - ucp_tag_t, ucp_tag_recv_callback_t); +typedef ucs_status_ptr_t (*dlsym_send)( + ucp_ep_h, const void*, size_t, ucp_datatype_t, ucp_tag_t, ucp_send_callback_t); +typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h, + void*, + size_t count, + ucp_datatype_t datatype, + ucp_tag_t, + ucp_tag_t, + ucp_tag_recv_callback_t); /** * Standard UCX request object that will be passed @@ -55,9 +58,9 @@ struct ucx_context { */ class ucp_request { public: - struct ucx_context *req; - bool needs_release = true; - int other_rank = -1; + struct ucx_context* req; + bool needs_release = true; + int other_rank = -1; bool is_send_request = false; }; @@ -67,18 +70,19 @@ static const ucp_tag_t default_tag_mask = -1; /** * @brief Asynchronous send callback sets request to completed */ -static void send_callback(void *request, ucs_status_t status) { - struct ucx_context *context = (struct ucx_context *)request; - context->completed = 1; +static void send_callback(void* request, ucs_status_t status) +{ + struct ucx_context* context = (struct ucx_context*)request; + context->completed = 1; } /** * @brief Asynchronous recv callback sets request to completed */ -static void recv_callback(void *request, ucs_status_t status, - ucp_tag_recv_info_t *info) { - struct ucx_context *context = (struct ucx_context *)request; - context->completed = 1; +static void recv_callback(void* request, ucs_status_t status, ucp_tag_recv_info_t* info) +{ + struct ucx_context* context = (struct ucx_context*)request; + context->completed = 1; } /** @@ -87,7 +91,8 @@ static void recv_callback(void *request, ucs_status_t status, */ class comms_ucp_handler { public: - comms_ucp_handler() { + comms_ucp_handler() + { load_ucp_handle(); load_send_func(); load_recv_func(); @@ -99,7 +104,7 @@ class comms_ucp_handler { ~comms_ucp_handler() { dlclose(ucp_handle); } private: - void *ucp_handle; + void* ucp_handle; dlsym_print_info print_info_func; dlsym_rec_free req_free_func; @@ -107,7 +112,8 @@ class comms_ucp_handler { dlsym_send send_func; dlsym_recv recv_func; - void load_ucp_handle() { + void load_ucp_handle() + { ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NOLOAD | RTLD_NODELETE); if (!ucp_handle) { ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NODELETE); @@ -117,51 +123,56 @@ class comms_ucp_handler { dlerror(); } - void assert_dlerror() { - char *error = dlerror(); + void assert_dlerror() + { + char* error = dlerror(); ASSERT(error == NULL, "Error loading function symbol: %s\n", error); } - void load_send_func() { + void load_send_func() + { send_func = (dlsym_send)dlsym(ucp_handle, "ucp_tag_send_nb"); assert_dlerror(); } - void load_free_req_func() { + void load_free_req_func() + { req_free_func = (dlsym_rec_free)dlsym(ucp_handle, "ucp_request_free"); assert_dlerror(); } - void load_print_info_func() { + void load_print_info_func() + { print_info_func = (dlsym_print_info)dlsym(ucp_handle, "ucp_ep_print_info"); assert_dlerror(); } - void load_worker_progress_func() { - worker_progress_func = - (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress"); + void load_worker_progress_func() + { + worker_progress_func = (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress"); assert_dlerror(); } - void load_recv_func() { + void load_recv_func() + { recv_func = (dlsym_recv)dlsym(ucp_handle, "ucp_tag_recv_nb"); assert_dlerror(); } - ucp_tag_t build_message_tag(int rank, int tag) const { + ucp_tag_t build_message_tag(int rank, int tag) const + { // keeping the rank in the lower bits enables debugging. return ((uint32_t)tag << 31) | (uint32_t)rank; } public: - int ucp_progress(ucp_worker_h worker) const { - return (*(worker_progress_func))(worker); - } + int ucp_progress(ucp_worker_h worker) const { return (*(worker_progress_func))(worker); } /** * @brief Frees any memory underlying the given ucp request object */ - void free_ucp_request(ucp_request *request) const { + void free_ucp_request(ucp_request* request) const + { if (request->needs_release) { request->req->completed = 0; (*(req_free_func))(request->req); @@ -172,56 +183,67 @@ class comms_ucp_handler { /** * @brief Asynchronously send data to the given endpoint using the given tag */ - void ucp_isend(ucp_request *req, ucp_ep_h ep_ptr, const void *buf, - size_t size, int tag, ucp_tag_t tag_mask, int rank) const { + void ucp_isend(ucp_request* req, + ucp_ep_h ep_ptr, + const void* buf, + size_t size, + int tag, + ucp_tag_t tag_mask, + int rank) const + { ucp_tag_t ucp_tag = build_message_tag(rank, tag); - ucs_status_ptr_t send_result = (*(send_func))( - ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback); - struct ucx_context *ucp_req = (struct ucx_context *)send_result; + ucs_status_ptr_t send_result = + (*(send_func))(ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback); + struct ucx_context* ucp_req = (struct ucx_context*)send_result; if (UCS_PTR_IS_ERR(send_result)) { ASSERT(!UCS_PTR_IS_ERR(send_result), "unable to send UCX data message (%d)\n", UCS_PTR_STATUS(send_result)); /** - * If the request didn't fail, but it's not OK, it is in flight. - * Expect the handler to be invoked - */ + * If the request didn't fail, but it's not OK, it is in flight. + * Expect the handler to be invoked + */ } else if (UCS_PTR_STATUS(send_result) != UCS_OK) { /** - * If the request is OK, it's already been completed and we don't need to wait on it. - * The request will be a nullptr, however, so we need to create a new request - * and set it to completed to make the "waitall()" function work properly. - */ + * If the request is OK, it's already been completed and we don't need to wait on it. + * The request will be a nullptr, however, so we need to create a new request + * and set it to completed to make the "waitall()" function work properly. + */ req->needs_release = true; } else { req->needs_release = false; } - req->other_rank = rank; + req->other_rank = rank; req->is_send_request = true; - req->req = ucp_req; + req->req = ucp_req; } /** * @brief Asynchronously receive data from given endpoint with the given tag. */ - void ucp_irecv(ucp_request *req, ucp_worker_h worker, ucp_ep_h ep_ptr, - void *buf, size_t size, int tag, ucp_tag_t tag_mask, - int sender_rank) const { + void ucp_irecv(ucp_request* req, + ucp_worker_h worker, + ucp_ep_h ep_ptr, + void* buf, + size_t size, + int tag, + ucp_tag_t tag_mask, + int sender_rank) const + { ucp_tag_t ucp_tag = build_message_tag(sender_rank, tag); ucs_status_ptr_t recv_result = - (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, - tag_mask, recv_callback); + (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, tag_mask, recv_callback); - struct ucx_context *ucp_req = (struct ucx_context *)recv_result; + struct ucx_context* ucp_req = (struct ucx_context*)recv_result; - req->req = ucp_req; - req->needs_release = true; + req->req = ucp_req; + req->needs_release = true; req->is_send_request = false; - req->other_rank = sender_rank; + req->other_rank = sender_rank; ASSERT(!UCS_PTR_IS_ERR(recv_result), "unable to receive UCX data message (%d)\n", diff --git a/cpp/include/raft/comms/util.hpp b/cpp/include/raft/comms/util.hpp index f3216abc37..1b0548fc00 100644 --- a/cpp/include/raft/comms/util.hpp +++ b/cpp/include/raft/comms/util.hpp @@ -26,88 +26,70 @@ * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an * exception detailing the NCCL error that occurred */ -#define NCCL_TRY(call) \ - do { \ - ncclResult_t const status = (call); \ - if (ncclSuccess != status) { \ - std::string msg{}; \ - SET_ERROR_MSG(msg, \ - "NCCL error encountered at: ", "call='%s', Reason=%d:%s", \ - #call, status, ncclGetErrorString(status)); \ - throw raft::logic_error(msg); \ - } \ +#define NCCL_TRY(call) \ + do { \ + ncclResult_t const status = (call); \ + if (ncclSuccess != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "NCCL error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + ncclGetErrorString(status)); \ + throw raft::logic_error(msg); \ + } \ } while (0); -#define NCCL_TRY_NO_THROW(call) \ - do { \ - ncclResult_t status = call; \ - if (ncclSuccess != status) { \ - printf("NCCL call='%s' failed. Reason:%s\n", #call, \ - ncclGetErrorString(status)); \ - } \ +#define NCCL_TRY_NO_THROW(call) \ + do { \ + ncclResult_t status = call; \ + if (ncclSuccess != status) { \ + printf("NCCL call='%s' failed. Reason:%s\n", #call, ncclGetErrorString(status)); \ + } \ } while (0) namespace raft { namespace comms { -constexpr size_t get_datatype_size(const datatype_t datatype) { +constexpr size_t get_datatype_size(const datatype_t datatype) +{ switch (datatype) { - case datatype_t::CHAR: - return sizeof(char); - case datatype_t::UINT8: - return sizeof(uint8_t); - case datatype_t::INT32: - return sizeof(int); - case datatype_t::UINT32: - return sizeof(unsigned int); - case datatype_t::INT64: - return sizeof(int64_t); - case datatype_t::UINT64: - return sizeof(uint64_t); - case datatype_t::FLOAT32: - return sizeof(float); - case datatype_t::FLOAT64: - return sizeof(double); - default: - throw "Unsupported datatype"; + case datatype_t::CHAR: return sizeof(char); + case datatype_t::UINT8: return sizeof(uint8_t); + case datatype_t::INT32: return sizeof(int); + case datatype_t::UINT32: return sizeof(unsigned int); + case datatype_t::INT64: return sizeof(int64_t); + case datatype_t::UINT64: return sizeof(uint64_t); + case datatype_t::FLOAT32: return sizeof(float); + case datatype_t::FLOAT64: return sizeof(double); + default: throw "Unsupported datatype"; } } -constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype) { +constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype) +{ switch (datatype) { - case datatype_t::CHAR: - return ncclChar; - case datatype_t::UINT8: - return ncclUint8; - case datatype_t::INT32: - return ncclInt; - case datatype_t::UINT32: - return ncclUint32; - case datatype_t::INT64: - return ncclInt64; - case datatype_t::UINT64: - return ncclUint64; - case datatype_t::FLOAT32: - return ncclFloat; - case datatype_t::FLOAT64: - return ncclDouble; - default: - throw "Unsupported datatype"; + case datatype_t::CHAR: return ncclChar; + case datatype_t::UINT8: return ncclUint8; + case datatype_t::INT32: return ncclInt; + case datatype_t::UINT32: return ncclUint32; + case datatype_t::INT64: return ncclInt64; + case datatype_t::UINT64: return ncclUint64; + case datatype_t::FLOAT32: return ncclFloat; + case datatype_t::FLOAT64: return ncclDouble; + default: throw "Unsupported datatype"; } } -constexpr ncclRedOp_t get_nccl_op(const op_t op) { +constexpr ncclRedOp_t get_nccl_op(const op_t op) +{ switch (op) { - case op_t::SUM: - return ncclSum; - case op_t::PROD: - return ncclProd; - case op_t::MIN: - return ncclMin; - case op_t::MAX: - return ncclMax; - default: - throw "Unsupported datatype"; + case op_t::SUM: return ncclSum; + case op_t::PROD: return ncclProd; + case op_t::MIN: return ncclMin; + case op_t::MAX: return ncclMax; + default: throw "Unsupported datatype"; } } }; // namespace comms diff --git a/cpp/include/raft/cuda_utils.cuh b/cpp/include/raft/cuda_utils.cuh index 14274043f5..8a66eff242 100644 --- a/cpp/include/raft/cuda_utils.cuh +++ b/cpp/include/raft/cuda_utils.cuh @@ -36,16 +36,17 @@ namespace raft { /** helper macro for device inlined functions */ -#define DI inline __device__ +#define DI inline __device__ #define HDI inline __host__ __device__ -#define HD __host__ __device__ +#define HD __host__ __device__ /** * @brief Provide a ceiling division operation ie. ceil(a / b) * @tparam IntType supposed to be only integers for now! */ template -constexpr HDI IntType ceildiv(IntType a, IntType b) { +constexpr HDI IntType ceildiv(IntType a, IntType b) +{ return (a + b - 1) / b; } @@ -54,7 +55,8 @@ constexpr HDI IntType ceildiv(IntType a, IntType b) { * @tparam IntType supposed to be only integers for now! */ template -constexpr HDI IntType alignTo(IntType a, IntType b) { +constexpr HDI IntType alignTo(IntType a, IntType b) +{ return ceildiv(a, b) * b; } @@ -63,7 +65,8 @@ constexpr HDI IntType alignTo(IntType a, IntType b) { * @tparam IntType supposed to be only integers for now! */ template -constexpr HDI IntType alignDown(IntType a, IntType b) { +constexpr HDI IntType alignDown(IntType a, IntType b) +{ return (a / b) * b; } @@ -72,7 +75,8 @@ constexpr HDI IntType alignDown(IntType a, IntType b) { * @tparam IntType data type (checked only for integers) */ template -constexpr HDI bool isPo2(IntType num) { +constexpr HDI bool isPo2(IntType num) +{ return (num && !(num & (num - 1))); } @@ -81,14 +85,16 @@ constexpr HDI bool isPo2(IntType num) { * @tparam IntType data type (checked only for integers) */ template -constexpr HDI IntType log2(IntType num, IntType ret = IntType(0)) { +constexpr HDI IntType log2(IntType num, IntType ret = IntType(0)) +{ return num <= IntType(1) ? ret : log2(num >> IntType(1), ++ret); } /** Device function to apply the input lambda across threads in the grid */ template -DI void forEach(int num, L lambda) { - int idx = (blockDim.x * blockIdx.x) + threadIdx.x; +DI void forEach(int num, L lambda) +{ + int idx = (blockDim.x * blockIdx.x) + threadIdx.x; const int numThreads = blockDim.x * gridDim.x; #pragma unroll for (int itr = 0; itr < ItemsPerThread; ++itr, idx += numThreads) { @@ -100,7 +106,8 @@ DI void forEach(int num, L lambda) { static const int WarpSize = 32; /** get the laneId of the current thread */ -DI int laneId() { +DI int laneId() +{ int id; asm("mov.s32 %0, %laneid;" : "=r"(id)); return id; @@ -113,15 +120,17 @@ DI int laneId() { * @param b second input */ template -HDI void swapVals(T &a, T &b) { +HDI void swapVals(T& a, T& b) +{ T tmp = a; - a = b; - b = tmp; + a = b; + b = tmp; } /** Device function to have atomic add support for older archs */ template -DI void myAtomicAdd(Type *address, Type val) { +DI void myAtomicAdd(Type* address, Type val) +{ atomicAdd(address, val); } @@ -129,105 +138,114 @@ DI void myAtomicAdd(Type *address, Type val) { // Ref: // http://on-demand.gputechconf.com/gtc/2013/presentations/S3101-Atomic-Memory-Operations.pdf template <> -DI void myAtomicAdd(double *address, double val) { - unsigned long long int *address_as_ull = (unsigned long long int *)address; - unsigned long long int old = *address_as_ull, assumed; +DI void myAtomicAdd(double* address, double val) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; do { assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); + old = + atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); } while (assumed != old); } #endif template -DI void myAtomicReduce(T *address, T val, ReduceLambda op); +DI void myAtomicReduce(T* address, T val, ReduceLambda op); template -DI void myAtomicReduce(double *address, double val, ReduceLambda op) { - unsigned long long int *address_as_ull = (unsigned long long int *)address; - unsigned long long int old = *address_as_ull, assumed; +DI void myAtomicReduce(double* address, double val, ReduceLambda op) +{ + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; do { assumed = old; - old = - atomicCAS(address_as_ull, assumed, - __double_as_longlong(op(val, __longlong_as_double(assumed)))); + old = atomicCAS( + address_as_ull, assumed, __double_as_longlong(op(val, __longlong_as_double(assumed)))); } while (assumed != old); } template -DI void myAtomicReduce(float *address, float val, ReduceLambda op) { - unsigned int *address_as_uint = (unsigned int *)address; - unsigned int old = *address_as_uint, assumed; +DI void myAtomicReduce(float* address, float val, ReduceLambda op) +{ + unsigned int* address_as_uint = (unsigned int*)address; + unsigned int old = *address_as_uint, assumed; do { assumed = old; - old = atomicCAS(address_as_uint, assumed, - __float_as_uint(op(val, __uint_as_float(assumed)))); + old = atomicCAS(address_as_uint, assumed, __float_as_uint(op(val, __uint_as_float(assumed)))); } while (assumed != old); } template -DI void myAtomicReduce(int *address, int val, ReduceLambda op) { +DI void myAtomicReduce(int* address, int val, ReduceLambda op) +{ int old = *address, assumed; do { assumed = old; - old = atomicCAS(address, assumed, op(val, assumed)); + old = atomicCAS(address, assumed, op(val, assumed)); } while (assumed != old); } template -DI void myAtomicReduce(long long *address, long long val, ReduceLambda op) { +DI void myAtomicReduce(long long* address, long long val, ReduceLambda op) +{ long long old = *address, assumed; do { assumed = old; - old = atomicCAS(address, assumed, op(val, assumed)); + old = atomicCAS(address, assumed, op(val, assumed)); } while (assumed != old); } template -DI void myAtomicReduce(unsigned long long *address, unsigned long long val, - ReduceLambda op) { +DI void myAtomicReduce(unsigned long long* address, unsigned long long val, ReduceLambda op) +{ unsigned long long old = *address, assumed; do { assumed = old; - old = atomicCAS(address, assumed, op(val, assumed)); + old = atomicCAS(address, assumed, op(val, assumed)); } while (assumed != old); } /** * @brief Provide atomic min operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val) + * @param[in] address: address to read old value from, and to atomically update w/ min(old value, + * val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMin(T *address, T val); +DI T myAtomicMin(T* address, T val); /** * @brief Provide atomic max operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val) + * @param[in] address: address to read old value from, and to atomically update w/ max(old value, + * val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMax(T *address, T val); +DI T myAtomicMax(T* address, T val); -DI float myAtomicMin(float *address, float val) { +DI float myAtomicMin(float* address, float val) +{ myAtomicReduce(address, val, fminf); return *address; } -DI float myAtomicMax(float *address, float val) { +DI float myAtomicMax(float* address, float val) +{ myAtomicReduce(address, val, fmaxf); return *address; } -DI double myAtomicMin(double *address, double val) { +DI double myAtomicMin(double* address, double val) +{ myAtomicReduce(address, val, fmin); return *address; } -DI double myAtomicMax(double *address, double val) { +DI double myAtomicMax(double* address, double val) +{ myAtomicReduce(address, val, fmax); return *address; } @@ -239,11 +257,13 @@ DI double myAtomicMax(double *address, double val) { template HDI T myMax(T x, T y); template <> -HDI float myMax(float x, float y) { +HDI float myMax(float x, float y) +{ return fmaxf(x, y); } template <> -HDI double myMax(double x, double y) { +HDI double myMax(double x, double y) +{ return fmax(x, y); } /** @} */ @@ -255,11 +275,13 @@ HDI double myMax(double x, double y) { template HDI T myMin(T x, T y); template <> -HDI float myMin(float x, float y) { +HDI float myMin(float x, float y) +{ return fminf(x, y); } template <> -HDI double myMin(double x, double y) { +HDI double myMin(double x, double y) +{ return fmin(x, y); } /** @} */ @@ -267,11 +289,13 @@ HDI double myMin(double x, double y) { /** * @brief Provide atomic min operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val) + * @param[in] address: address to read old value from, and to atomically update w/ min(old value, + * val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMin(T *address, T val) { +DI T myAtomicMin(T* address, T val) +{ myAtomicReduce(address, val, myMin); return *address; } @@ -279,11 +303,13 @@ DI T myAtomicMin(T *address, T val) { /** * @brief Provide atomic max operation. * @tparam T: data type for input data (float or double). - * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val) + * @param[in] address: address to read old value from, and to atomically update w/ max(old value, + * val) * @param[in] val: new value to compare with old */ template -DI T myAtomicMax(T *address, T val) { +DI T myAtomicMax(T* address, T val) +{ myAtomicReduce(address, val, myMax); return *address; } @@ -292,7 +318,8 @@ DI T myAtomicMax(T *address, T val) { * Sign function */ template -HDI int sgn(const T val) { +HDI int sgn(const T val) +{ return (T(0) < val) - (val < T(0)); } @@ -303,11 +330,13 @@ HDI int sgn(const T val) { template HDI T myExp(T x); template <> -HDI float myExp(float x) { +HDI float myExp(float x) +{ return expf(x); } template <> -HDI double myExp(double x) { +HDI double myExp(double x) +{ return exp(x); } /** @} */ @@ -319,11 +348,13 @@ HDI double myExp(double x) { template inline __device__ T myInf(); template <> -inline __device__ float myInf() { +inline __device__ float myInf() +{ return CUDART_INF_F; } template <> -inline __device__ double myInf() { +inline __device__ double myInf() +{ return CUDART_INF; } /** @} */ @@ -335,11 +366,13 @@ inline __device__ double myInf() { template HDI T myLog(T x); template <> -HDI float myLog(float x) { +HDI float myLog(float x) +{ return logf(x); } template <> -HDI double myLog(double x) { +HDI double myLog(double x) +{ return log(x); } /** @} */ @@ -351,11 +384,13 @@ HDI double myLog(double x) { template HDI T mySqrt(T x); template <> -HDI float mySqrt(float x) { +HDI float mySqrt(float x) +{ return sqrtf(x); } template <> -HDI double mySqrt(double x) { +HDI double mySqrt(double x) +{ return sqrt(x); } /** @} */ @@ -365,13 +400,15 @@ HDI double mySqrt(double x) { * @{ */ template -DI void mySinCos(T x, T &s, T &c); +DI void mySinCos(T x, T& s, T& c); template <> -DI void mySinCos(float x, float &s, float &c) { +DI void mySinCos(float x, float& s, float& c) +{ sincosf(x, &s, &c); } template <> -DI void mySinCos(double x, double &s, double &c) { +DI void mySinCos(double x, double& s, double& c) +{ sincos(x, &s, &c); } /** @} */ @@ -383,11 +420,13 @@ DI void mySinCos(double x, double &s, double &c) { template DI T mySin(T x); template <> -DI float mySin(float x) { +DI float mySin(float x) +{ return sinf(x); } template <> -DI double mySin(double x) { +DI double mySin(double x) +{ return sin(x); } /** @} */ @@ -397,15 +436,18 @@ DI double mySin(double x) { * @{ */ template -DI T myAbs(T x) { +DI T myAbs(T x) +{ return x < 0 ? -x : x; } template <> -DI float myAbs(float x) { +DI float myAbs(float x) +{ return fabsf(x); } template <> -DI double myAbs(double x) { +DI double myAbs(double x) +{ return fabs(x); } /** @} */ @@ -417,11 +459,13 @@ DI double myAbs(double x) { template HDI T myPow(T x, T power); template <> -HDI float myPow(float x, float power) { +HDI float myPow(float x, float power) +{ return powf(x, power); } template <> -HDI double myPow(double x, double power) { +HDI double myPow(double x, double power) +{ return pow(x, power); } /** @} */ @@ -433,11 +477,13 @@ HDI double myPow(double x, double power) { template HDI T myTanh(T x); template <> -HDI float myTanh(float x) { +HDI float myTanh(float x) +{ return tanhf(x); } template <> -HDI double myTanh(double x) { +HDI double myTanh(double x) +{ return tanh(x); } /** @} */ @@ -449,11 +495,13 @@ HDI double myTanh(double x) { template HDI T myATanh(T x); template <> -HDI float myATanh(float x) { +HDI float myATanh(float x) +{ return atanhf(x); } template <> -HDI double myATanh(double x) { +HDI double myATanh(double x) +{ return atanh(x); } /** @} */ @@ -492,15 +540,18 @@ struct Sum { * @{ */ template -DI T signPrim(T x) { +DI T signPrim(T x) +{ return x < 0 ? -1 : +1; } template <> -DI float signPrim(float x) { +DI float signPrim(float x) +{ return signbit(x) == true ? -1.0f : +1.0f; } template <> -DI double signPrim(double x) { +DI double signPrim(double x) +{ return signbit(x) == true ? -1.0 : +1.0; } /** @} */ @@ -514,28 +565,33 @@ DI double signPrim(double x) { * @{ */ template -DI T maxPrim(T x, T y) { +DI T maxPrim(T x, T y) +{ return x > y ? x : y; } template <> -DI float maxPrim(float x, float y) { +DI float maxPrim(float x, float y) +{ return fmaxf(x, y); } template <> -DI double maxPrim(double x, double y) { +DI double maxPrim(double x, double y) +{ return fmax(x, y); } /** @} */ /** apply a warp-wide fence (useful from Volta+ archs) */ -DI void warpFence() { +DI void warpFence() +{ #if __CUDA_ARCH__ >= 700 __syncwarp(); #endif } /** warp-wide any boolean aggregator */ -DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) { +DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) +{ #if CUDART_VERSION >= 9000 inFlag = __any_sync(mask, inFlag); #else @@ -545,7 +601,8 @@ DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) { } /** warp-wide all boolean aggregator */ -DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) { +DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) +{ #if CUDART_VERSION >= 9000 inFlag = __all_sync(mask, inFlag); #else @@ -564,8 +621,8 @@ DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) { * @return the shuffled data */ template -DI T shfl(T val, int srcLane, int width = WarpSize, - uint32_t mask = 0xffffffffu) { +DI T shfl(T val, int srcLane, int width = WarpSize, uint32_t mask = 0xffffffffu) +{ #if CUDART_VERSION >= 9000 return __shfl_sync(mask, val, srcLane, width); #else @@ -583,8 +640,8 @@ DI T shfl(T val, int srcLane, int width = WarpSize, * @return the shuffled data */ template -DI T shfl_xor(T val, int laneMask, int width = WarpSize, - uint32_t mask = 0xffffffffu) { +DI T shfl_xor(T val, int laneMask, int width = WarpSize, uint32_t mask = 0xffffffffu) +{ #if CUDART_VERSION >= 9000 return __shfl_xor_sync(mask, val, laneMask, width); #else @@ -602,7 +659,8 @@ DI T shfl_xor(T val, int laneMask, int width = WarpSize, * @todo Expand this to support arbitrary reduction ops */ template -DI T warpReduce(T val) { +DI T warpReduce(T val) +{ #pragma unroll for (int i = WarpSize / 2; i > 0; i >>= 1) { T tmp = shfl(val, laneId() + i); @@ -623,12 +681,13 @@ DI T warpReduce(T val) { * @todo Expand this to support arbitrary reduction ops */ template -DI T blockReduce(T val, char *smem) { - auto *sTemp = reinterpret_cast(smem); - int nWarps = (blockDim.x + WarpSize - 1) / WarpSize; - int lid = laneId(); - int wid = threadIdx.x / WarpSize; - val = warpReduce(val); +DI T blockReduce(T val, char* smem) +{ + auto* sTemp = reinterpret_cast(smem); + int nWarps = (blockDim.x + WarpSize - 1) / WarpSize; + int lid = laneId(); + int wid = threadIdx.x / WarpSize; + val = warpReduce(val); if (lid == 0) sTemp[wid] = val; __syncthreads(); val = lid < nWarps ? sTemp[lid] : T(0); @@ -644,8 +703,10 @@ DI T blockReduce(T val, char *smem) { * @param idx the index for which to query the stream */ inline cudaStream_t select_stream(cudaStream_t user_stream, - cudaStream_t *int_streams, int n_int_streams, - int idx) { + cudaStream_t* int_streams, + int n_int_streams, + int idx) +{ return n_int_streams > 0 ? int_streams[idx % n_int_streams] : user_stream; } diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 486103dedb..cf06416a96 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -54,17 +54,20 @@ struct cuda_error : public raft::exception { * */ #ifndef CUDA_TRY -#define CUDA_TRY(call) \ - do { \ - cudaError_t const status = call; \ - if (status != cudaSuccess) { \ - cudaGetLastError(); \ - std::string msg{}; \ - SET_ERROR_MSG( \ - msg, "CUDA error encountered at: ", "call='%s', Reason=%s:%s", #call, \ - cudaGetErrorName(status), cudaGetErrorString(status)); \ - throw raft::cuda_error(msg); \ - } \ +#define CUDA_TRY(call) \ + do { \ + cudaError_t const status = call; \ + if (status != cudaSuccess) { \ + cudaGetLastError(); \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "CUDA error encountered at: ", \ + "call='%s', Reason=%s:%s", \ + #call, \ + cudaGetErrorName(status), \ + cudaGetErrorString(status)); \ + throw raft::cuda_error(msg); \ + } \ } while (0) #endif /** @@ -97,13 +100,16 @@ struct cuda_error : public raft::exception { // * exception. // */ #ifndef CUDA_CHECK_NO_THROW -#define CUDA_CHECK_NO_THROW(call) \ - do { \ - cudaError_t const status = call; \ - if (cudaSuccess != status) { \ - printf("CUDA call='%s' at file=%s line=%d failed with %s\n", #call, \ - __FILE__, __LINE__, cudaGetErrorString(status)); \ - } \ +#define CUDA_CHECK_NO_THROW(call) \ + do { \ + cudaError_t const status = call; \ + if (cudaSuccess != status) { \ + printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \ + #call, \ + __FILE__, \ + __LINE__, \ + cudaGetErrorString(status)); \ + } \ } while (0) #endif @@ -112,7 +118,7 @@ struct cuda_error : public raft::exception { * TODO: Rename original implementations in 22.04 to fix * https://github.com/rapidsai/raft/issues/128 */ -#define RAFT_CUDA_CHECK(call) CUDA_CHECK(call) +#define RAFT_CUDA_CHECK(call) CUDA_CHECK(call) #define RAFT_CUDA_CHECK_NO_THROW(call) CUDA_CHECK_NO_THROW(call) namespace raft { @@ -120,9 +126,7 @@ namespace raft { /** Helper method to get to know warp size in device code */ __host__ __device__ constexpr inline int warp_size() { return 32; } -__host__ __device__ constexpr inline unsigned int warp_full_mask() { - return 0xffffffff; -} +__host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; } /** * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping @@ -134,20 +138,23 @@ class grid_1d_thread_t { int const num_blocks{0}; /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - * @param elements_per_thread Typically, a single kernel thread processes more than a single - * element; this affects the number of threads the grid must contain - */ - grid_1d_thread_t(size_t overall_num_elements, size_t num_threads_per_block, - size_t max_num_blocks_1d, size_t elements_per_thread = 1) + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + * @param elements_per_thread Typically, a single kernel thread processes more than a single + * element; this affects the number of threads the grid must contain + */ + grid_1d_thread_t(size_t overall_num_elements, + size_t num_threads_per_block, + size_t max_num_blocks_1d, + size_t elements_per_thread = 1) : block_size(num_threads_per_block), - num_blocks(std::min((overall_num_elements + - (elements_per_thread * num_threads_per_block) - 1) / - (elements_per_thread * num_threads_per_block), - max_num_blocks_1d)) { + num_blocks( + std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) / + (elements_per_thread * num_threads_per_block), + max_num_blocks_1d)) + { RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, "num_threads_per_block / warp_size() must be > 0"); @@ -165,18 +172,19 @@ class grid_1d_warp_t { int const num_blocks{0}; /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - */ - grid_1d_warp_t(size_t overall_num_elements, size_t num_threads_per_block, + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + */ + grid_1d_warp_t(size_t overall_num_elements, + size_t num_threads_per_block, size_t max_num_blocks_1d) : block_size(num_threads_per_block), - num_blocks(std::min( - (overall_num_elements + (num_threads_per_block / warp_size()) - 1) / - (num_threads_per_block / warp_size()), - max_num_blocks_1d)) { + num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) / + (num_threads_per_block / warp_size()), + max_num_blocks_1d)) + { RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, "num_threads_per_block / warp_size() must be > 0"); @@ -193,15 +201,17 @@ class grid_1d_block_t { int const num_blocks{0}; /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - */ - grid_1d_block_t(size_t overall_num_elements, size_t num_threads_per_block, + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + */ + grid_1d_block_t(size_t overall_num_elements, + size_t num_threads_per_block, size_t max_num_blocks_1d) : block_size(num_threads_per_block), - num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) { + num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) + { RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, "num_threads_per_block / warp_size() must be > 0"); @@ -217,10 +227,9 @@ class grid_1d_block_t { * @param stream cuda stream */ template -void copy(Type* dst, const Type* src, size_t len, - rmm::cuda_stream_view stream) { - CUDA_CHECK( - cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); +void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream) +{ + CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); } /** @@ -231,23 +240,22 @@ void copy(Type* dst, const Type* src, size_t len, */ /** performs a host to device copy */ template -void update_device(Type* d_ptr, const Type* h_ptr, size_t len, - rmm::cuda_stream_view stream) { +void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream) +{ copy(d_ptr, h_ptr, len, stream); } /** performs a device to host copy */ template -void update_host(Type* h_ptr, const Type* d_ptr, size_t len, - rmm::cuda_stream_view stream) { +void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream) +{ copy(h_ptr, d_ptr, len, stream); } template -void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, - rmm::cuda_stream_view stream) { - CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), - cudaMemcpyDeviceToDevice, stream)); +void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream) +{ + CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); } /** @} */ @@ -256,8 +264,11 @@ void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, * @{ */ template -void print_host_vector(const char* variable_name, const T* host_mem, - size_t componentsCount, OutStream& out) { +void print_host_vector(const char* variable_name, + const T* host_mem, + size_t componentsCount, + OutStream& out) +{ out << variable_name << "=["; for (size_t i = 0; i < componentsCount; ++i) { if (i != 0) out << ","; @@ -267,11 +278,13 @@ void print_host_vector(const char* variable_name, const T* host_mem, } template -void print_device_vector(const char* variable_name, const T* devMem, - size_t componentsCount, OutStream& out) { +void print_device_vector(const char* variable_name, + const T* devMem, + size_t componentsCount, + OutStream& out) +{ T* host_mem = new T[componentsCount]; - CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), - cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost)); print_host_vector(variable_name, host_mem, componentsCount, out); delete[] host_mem; } @@ -281,10 +294,10 @@ static std::mutex mutex_; static std::unordered_map allocations; template -void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, - bool setZero = false) { +void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, bool setZero = false) +{ size_t size = len * sizeof(Type); - ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); + ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream)); std::lock_guard _(mutex_); @@ -292,17 +305,19 @@ void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, } template -void deallocate(Type*& ptr, rmm::cuda_stream_view stream) { +void deallocate(Type*& ptr, rmm::cuda_stream_view stream) +{ std::lock_guard _(mutex_); size_t size = allocations[ptr]; allocations.erase(ptr); rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream); } -inline void deallocate_all(rmm::cuda_stream_view stream) { +inline void deallocate_all(rmm::cuda_stream_view stream) +{ std::lock_guard _(mutex_); for (auto& alloc : allocations) { - void* ptr = alloc.first; + void* ptr = alloc.first; size_t size = alloc.second; rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream); } @@ -310,29 +325,29 @@ inline void deallocate_all(rmm::cuda_stream_view stream) { } /** helper method to get max usable shared mem per block parameter */ -inline int getSharedMemPerBlock() { +inline int getSharedMemPerBlock() +{ int devId; CUDA_CHECK(cudaGetDevice(&devId)); int smemPerBlk; - CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk, - cudaDevAttrMaxSharedMemoryPerBlock, devId)); + CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId)); return smemPerBlk; } /** helper method to get multi-processor count parameter */ -inline int getMultiProcessorCount() { +inline int getMultiProcessorCount() +{ int devId; CUDA_CHECK(cudaGetDevice(&devId)); int mpCount; - CUDA_CHECK( - cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); + CUDA_CHECK(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId)); return mpCount; } /** helper method to convert an array on device to a string on host */ template -std::string arr2Str(const T* arr, int size, std::string name, - cudaStream_t stream, int width = 4) { +std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4) +{ std::stringstream ss; T* arr_h = (T*)malloc(size * sizeof(T)); @@ -354,53 +369,54 @@ std::string arr2Str(const T* arr, int size, std::string name, /** this seems to be unused, but may be useful in the future */ template -void ASSERT_DEVICE_MEM(T* ptr, std::string name) { +void ASSERT_DEVICE_MEM(T* ptr, std::string name) +{ cudaPointerAttributes s_att; cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); if (s_err != 0 || s_att.device == -1) - std::cout << "Invalid device pointer encountered in " << name - << ". device=" << s_att.device << ", err=" << s_err << std::endl; + std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device + << ", err=" << s_err << std::endl; } -inline uint32_t curTimeMillis() { - auto now = std::chrono::high_resolution_clock::now(); +inline uint32_t curTimeMillis() +{ + auto now = std::chrono::high_resolution_clock::now(); auto duration = now.time_since_epoch(); - return std::chrono::duration_cast(duration) - .count(); + return std::chrono::duration_cast(duration).count(); } /** Helper function to calculate need memory for allocate to store dense matrix. - * @param rows number of rows in matrix - * @param columns number of columns in matrix - * @return need number of items to allocate via allocate() - * @sa allocate() - */ -inline size_t allocLengthForMatrix(size_t rows, size_t columns) { - return rows * columns; -} + * @param rows number of rows in matrix + * @param columns number of columns in matrix + * @return need number of items to allocate via allocate() + * @sa allocate() + */ +inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; } /** Helper function to check alignment of pointer. - * @param ptr the pointer to check - * @param alignment to be checked for - * @return true if address in bytes is a multiple of alignment - */ + * @param ptr the pointer to check + * @param alignment to be checked for + * @return true if address in bytes is a multiple of alignment + */ template -bool is_aligned(Type* ptr, size_t alignment) { +bool is_aligned(Type* ptr, size_t alignment) +{ return reinterpret_cast(ptr) % alignment == 0; } /** calculate greatest common divisor of two numbers -* @a integer -* @b integer -* @ return gcd of a and b -*/ + * @a integer + * @b integer + * @ return gcd of a and b + */ template -IntType gcd(IntType a, IntType b) { +IntType gcd(IntType a, IntType b) +{ while (b != 0) { IntType tmp = b; - b = a % b; - a = tmp; + b = a % b; + a = tmp; } return a; } diff --git a/cpp/include/raft/device_atomics.cuh b/cpp/include/raft/device_atomics.cuh index a4ebcc9900..e3b324d030 100644 --- a/cpp/include/raft/device_atomics.cuh +++ b/cpp/include/raft/device_atomics.cuh @@ -39,9 +39,9 @@ namespace detail { /* @brief binary `sum` operator */ struct DeviceSum { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return lhs + rhs; } }; @@ -49,7 +49,8 @@ struct DeviceSum { /* @brief binary `min` operator */ struct DeviceMin { template - __device__ T operator()(const T& lhs, const T& rhs) { + __device__ T operator()(const T& lhs, const T& rhs) + { return lhs < rhs ? lhs : rhs; } }; @@ -57,43 +58,44 @@ struct DeviceMin { /* @brief binary `max` operator */ struct DeviceMax { template - __device__ T operator()(const T& lhs, const T& rhs) { + __device__ T operator()(const T& lhs, const T& rhs) + { return lhs > rhs ? lhs : rhs; } }; /* @brief binary `product` operator */ struct DeviceProduct { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return lhs * rhs; } }; /* @brief binary `and` operator */ struct DeviceAnd { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return (lhs & rhs); } }; /* @brief binary `or` operator */ struct DeviceOr { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return (lhs | rhs); } }; /* @brief binary `xor` operator */ struct DeviceXor { - template ::value>* = nullptr> - __device__ T operator()(const T& lhs, const T& rhs) { + template ::value>* = nullptr> + __device__ T operator()(const T& lhs, const T& rhs) + { return (lhs ^ rhs); } }; @@ -103,9 +105,9 @@ struct DeviceXor { #define errmsg_cast "size mismatch." template -__forceinline__ __device__ T_output type_reinterpret(T_input value) { - static_assert(sizeof(T_output) == sizeof(T_input), - "type_reinterpret for different size"); +__forceinline__ __device__ T_output type_reinterpret(T_input value) +{ + static_assert(sizeof(T_output) == sizeof(T_input), "type_reinterpret for different size"); return *(reinterpret_cast(&value)); } @@ -118,25 +120,22 @@ struct genericAtomicOperationImpl; // single byte atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - Op op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) + { using T_int = unsigned int; - T_int* address_uint32 = - reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); - T_int shift = ((reinterpret_cast(addr) & 3) * 8); + T_int* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); + T_int shift = ((reinterpret_cast(addr) & 3) * 8); T_int old = *address_uint32; T_int assumed; do { - assumed = old; - T target_value = T((old >> shift) & 0xff); - uint8_t updating_value = - type_reinterpret(op(target_value, update_value)); - T_int new_value = - (old & ~(0x000000ff << shift)) | (T_int(updating_value) << shift); - old = atomicCAS(address_uint32, assumed, new_value); + assumed = old; + T target_value = T((old >> shift) & 0xff); + uint8_t updating_value = type_reinterpret(op(target_value, update_value)); + T_int new_value = (old & ~(0x000000ff << shift)) | (T_int(updating_value) << shift); + old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); return T((old >> shift) & 0xff); @@ -146,26 +145,24 @@ struct genericAtomicOperationImpl { // 2 bytes atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - Op op) { - using T_int = unsigned int; + __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) + { + using T_int = unsigned int; bool is_32_align = (reinterpret_cast(addr) & 2) ? false : true; - T_int* address_uint32 = reinterpret_cast( - reinterpret_cast(addr) - (is_32_align ? 0 : 2)); + T_int* address_uint32 = + reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 0 : 2)); T_int old = *address_uint32; T_int assumed; do { - assumed = old; - T target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); - uint16_t updating_value = - type_reinterpret(op(target_value, update_value)); - - T_int new_value = (is_32_align) - ? (old & 0xffff0000) | updating_value - : (old & 0xffff) | (T_int(updating_value) << 16); - old = atomicCAS(address_uint32, assumed, new_value); + assumed = old; + T target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); + uint16_t updating_value = type_reinterpret(op(target_value, update_value)); + + T_int new_value = (is_32_align) ? (old & 0xffff0000) | updating_value + : (old & 0xffff) | (T_int(updating_value) << 16); + old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); return (is_32_align) ? T(old & 0xffff) : T(old >> 16); @@ -176,20 +173,18 @@ struct genericAtomicOperationImpl { // 4 bytes atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - Op op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) + { using T_int = unsigned int; T old_value = *addr; T assumed{old_value}; if constexpr (std::is_same{} && (std::is_same{})) { - if (isnan(update_value)) { - return old_value; - } + if (isnan(update_value)) { return old_value; } } do { - assumed = old_value; + assumed = old_value; const T new_value = op(old_value, update_value); T_int ret = atomicCAS(reinterpret_cast(addr), @@ -206,17 +201,13 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = float; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceMax op) { - if (isnan(update_value)) { - return *addr; - } + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) + { + if (isnan(update_value)) { return *addr; } - T old = - (update_value >= 0) - ? __int_as_float(atomicMax((int*)addr, __float_as_int(update_value))) - : __uint_as_float( - atomicMin((unsigned int*)addr, __float_as_uint(update_value))); + T old = (update_value >= 0) + ? __int_as_float(atomicMax((int*)addr, __float_as_int(update_value))) + : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(update_value))); return old; } @@ -225,8 +216,8 @@ struct genericAtomicOperationImpl { // 8 bytes atomic operation template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - Op op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); @@ -234,7 +225,7 @@ struct genericAtomicOperationImpl { T assumed{old_value}; do { - assumed = old_value; + assumed = old_value; const T new_value = op(old_value, update_value); T_int ret = atomicCAS(reinterpret_cast(addr), @@ -250,8 +241,8 @@ struct genericAtomicOperationImpl { // ------------------------------------------------------------------------------------------------- // specialized functions for operators -// `atomicAdd` supports int, unsigned int, unsigend long long int, float, double (long long int is not supproted.) -// `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int +// `atomicAdd` supports int, unsigned int, unsigend long long int, float, double (long long int is +// not supproted.) `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int // `atomicAnd`, `atomicOr`, `atomicXor` support int, unsigned int, unsigned long long int // CUDA natively supports `unsigned long long int` for `atomicAdd`, @@ -264,12 +255,11 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceSum op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAdd(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicAdd(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -277,12 +267,11 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = unsigned long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceSum op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAdd(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicAdd(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -297,12 +286,11 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = long long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceSum op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAdd(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicAdd(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -310,12 +298,11 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = unsigned long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceMin op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMin op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T ret = atomicMin(reinterpret_cast(addr), - type_reinterpret(update_value)); + T ret = atomicMin(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -323,48 +310,44 @@ struct genericAtomicOperationImpl { template <> struct genericAtomicOperationImpl { using T = unsigned long int; - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceMax op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T ret = atomicMax(reinterpret_cast(addr), - type_reinterpret(update_value)); + T ret = atomicMax(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceAnd op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceAnd op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicAnd(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicAnd(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceOr op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceOr op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicOr(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicOr(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; template struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, - DeviceXor op) { + __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceXor op) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); - T_int ret = atomicXor(reinterpret_cast(addr), - type_reinterpret(update_value)); + T_int ret = atomicXor(reinterpret_cast(addr), type_reinterpret(update_value)); return type_reinterpret(ret); } }; @@ -377,13 +360,12 @@ struct typesAtomicCASImpl; template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, - T const& update_value) { + __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) + { using T_int = unsigned int; - T_int shift = ((reinterpret_cast(addr) & 3) * 8); - T_int* address_uint32 = - reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); + T_int shift = ((reinterpret_cast(addr) & 3) * 8); + T_int* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); // the 'target_value' in `old` can be different from `compare` // because other thread may update the value @@ -394,15 +376,14 @@ struct typesAtomicCASImpl { uint8_t u_val = type_reinterpret(update_value); do { - assumed = old; + assumed = old; target_value = T((old >> shift) & 0xff); // have to compare `target_value` and `compare` before calling atomicCAS // the `target_value` in `old` can be different with `compare` if (target_value != compare) break; - T_int new_value = - (old & ~(0x000000ff << shift)) | (T_int(u_val) << shift); - old = atomicCAS(address_uint32, assumed, new_value); + T_int new_value = (old & ~(0x000000ff << shift)) | (T_int(u_val) << shift); + old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); return target_value; @@ -411,13 +392,13 @@ struct typesAtomicCASImpl { template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, - T const& update_value) { + __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) + { using T_int = unsigned int; bool is_32_align = (reinterpret_cast(addr) & 2) ? false : true; - T_int* address_uint32 = reinterpret_cast( - reinterpret_cast(addr) - (is_32_align ? 0 : 2)); + T_int* address_uint32 = + reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 0 : 2)); T_int old = *address_uint32; T_int assumed; @@ -425,12 +406,12 @@ struct typesAtomicCASImpl { uint16_t u_val = type_reinterpret(update_value); do { - assumed = old; + assumed = old; target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16); if (target_value != compare) break; - T_int new_value = (is_32_align) ? (old & 0xffff0000) | u_val - : (old & 0xffff) | (T_int(u_val) << 16); + T_int new_value = + (is_32_align) ? (old & 0xffff0000) | u_val : (old & 0xffff) | (T_int(u_val) << 16); old = atomicCAS(address_uint32, assumed, new_value); } while (assumed != old); @@ -440,8 +421,8 @@ struct typesAtomicCASImpl { template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, - T const& update_value) { + __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) + { using T_int = unsigned int; T_int ret = atomicCAS(reinterpret_cast(addr), @@ -454,8 +435,8 @@ struct typesAtomicCASImpl { // 8 bytes atomic operation template struct typesAtomicCASImpl { - __forceinline__ __device__ T operator()(T* addr, T const& compare, - T const& update_value) { + __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) + { using T_int = unsigned long long int; static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); @@ -487,11 +468,10 @@ struct typesAtomicCASImpl { * @returns The old value at `address` * -------------------------------------------------------------------------**/ template -typename std::enable_if_t::value, T> __forceinline__ - __device__ - genericAtomicOperation(T* address, T const& update_value, BinaryOp op) { - auto fun = - raft::device_atomics::detail::genericAtomicOperationImpl{}; +typename std::enable_if_t::value, T> __forceinline__ __device__ +genericAtomicOperation(T* address, T const& update_value, BinaryOp op) +{ + auto fun = raft::device_atomics::detail::genericAtomicOperationImpl{}; return T(fun(address, update_value, op)); } @@ -499,11 +479,11 @@ typename std::enable_if_t::value, T> __forceinline__ template __forceinline__ __device__ bool genericAtomicOperation(bool* address, bool const& update_value, - BinaryOp op) { + BinaryOp op) +{ using T = bool; // don't use underlying type to apply operation for bool - auto fun = - raft::device_atomics::detail::genericAtomicOperationImpl{}; + auto fun = raft::device_atomics::detail::genericAtomicOperationImpl{}; return T(fun(address, update_value, op)); } @@ -525,9 +505,9 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address, * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicAdd(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceSum{}); +__forceinline__ __device__ T atomicAdd(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceSum{}); } /** @@ -546,9 +526,9 @@ __forceinline__ __device__ T atomicAdd(T* address, T val) { * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMin(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceMin{}); +__forceinline__ __device__ T atomicMin(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceMin{}); } /** @@ -567,9 +547,9 @@ __forceinline__ __device__ T atomicMin(T* address, T val) { * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicMax(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceMax{}); +__forceinline__ __device__ T atomicMax(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceMax{}); } /** @@ -589,9 +569,9 @@ __forceinline__ __device__ T atomicMax(T* address, T val) { * @returns The old value at `address` */ template -__forceinline__ __device__ T atomicCAS(T* address, T compare, T val) { - return raft::device_atomics::detail::typesAtomicCASImpl()(address, compare, - val); +__forceinline__ __device__ T atomicCAS(T* address, T compare, T val) +{ + return raft::device_atomics::detail::typesAtomicCASImpl()(address, compare, val); } /** @@ -609,11 +589,10 @@ __forceinline__ __device__ T atomicCAS(T* address, T compare, T val) { * * @returns The old value at `address` */ -template ::value, T>* = nullptr> -__forceinline__ __device__ T atomicAnd(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceAnd{}); +template ::value, T>* = nullptr> +__forceinline__ __device__ T atomicAnd(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceAnd{}); } /** @@ -631,11 +610,10 @@ __forceinline__ __device__ T atomicAnd(T* address, T val) { * * @returns The old value at `address` */ -template ::value, T>* = nullptr> -__forceinline__ __device__ T atomicOr(T* address, T val) { - return raft::genericAtomicOperation(address, val, - raft::device_atomics::detail::DeviceOr{}); +template ::value, T>* = nullptr> +__forceinline__ __device__ T atomicOr(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceOr{}); } /** @@ -653,9 +631,8 @@ __forceinline__ __device__ T atomicOr(T* address, T val) { * * @returns The old value at `address` */ -template ::value, T>* = nullptr> -__forceinline__ __device__ T atomicXor(T* address, T val) { - return raft::genericAtomicOperation( - address, val, raft::device_atomics::detail::DeviceXor{}); +template ::value, T>* = nullptr> +__forceinline__ __device__ T atomicXor(T* address, T val) +{ + return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceXor{}); } diff --git a/cpp/include/raft/distance/detail/canberra.cuh b/cpp/include/raft/distance/detail/canberra.cuh index c4c384c45f..46edf0bf47 100644 --- a/cpp/include/raft/distance/detail/canberra.cuh +++ b/cpp/include/raft/distance/detail/canberra.cuh @@ -45,75 +45,108 @@ namespace detail { * @param fin_op the final gemm epilogue lambda * @param stream cuda stream to launch work */ -template -static void canberraImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +static void canberraImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { const auto diff = raft::L1Op()(x - y); - const auto add = raft::myAbs(x) + raft::myAbs(y); + const auto add = raft::myAbs(x) + raft::myAbs(y); // deal with potential for 0 in denominator by // forcing 1/0 instead acc += ((add != 0) * diff / (add + (add == 0))); }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { return; }; if (isRowMajor) { - auto canberraRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, canberraRowMajor); + auto canberraRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, canberraRowMajor); canberraRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto canberraColMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, canberraColMajor); + auto canberraColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, canberraColMajor); canberraColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void canberra(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +void canberra(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - canberraImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + canberraImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - canberraImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + canberraImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { canberraImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -138,16 +171,25 @@ void canberra(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param[in] stream cuda stream to launch work * @param[in] isRowMajor whether the input and output matrices are row major */ -template -void canberraImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void canberraImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - canberraOutType; + typedef typename std::conditional::type canberraOutType; Index_ lda, ldb, ldd; - canberraOutType *pDcast = reinterpret_cast(pD); + canberraOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; canberra( diff --git a/cpp/include/raft/distance/detail/chebyshev.cuh b/cpp/include/raft/distance/detail/chebyshev.cuh index 77fba28310..99b314bd08 100644 --- a/cpp/include/raft/distance/detail/chebyshev.cuh +++ b/cpp/include/raft/distance/detail/chebyshev.cuh @@ -44,72 +44,105 @@ namespace detail { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void chebyshevImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, - IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +static void chebyshevImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { const auto diff = raft::L1Op()(x - y); - acc = raft::myMax(acc, diff); + acc = raft::myMax(acc, diff); }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { return; }; if (isRowMajor) { - auto chebyshevRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - chebyshevRowMajor); + auto chebyshevRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, chebyshevRowMajor); chebyshevRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto chebyshevColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - chebyshevColMajor); + auto chebyshevColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, chebyshevColMajor); chebyshevColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void chebyshev(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void chebyshev(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - chebyshevImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + chebyshevImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - chebyshevImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + chebyshevImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { chebyshevImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -134,16 +167,25 @@ void chebyshev(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param[in] stream cuda stream to launch work * @param[in] isRowMajor whether the input and output matrices are row major */ -template -void chebyshevImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void chebyshevImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - chebyshevOutType; + typedef typename std::conditional::type chebyshevOutType; Index_ lda, ldb, ldd; - chebyshevOutType *pDcast = reinterpret_cast(pD); + chebyshevOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; chebyshev( diff --git a/cpp/include/raft/distance/detail/correlation.cuh b/cpp/include/raft/distance/detail/correlation.cuh index cee986997a..159f9ab580 100644 --- a/cpp/include/raft/distance/detail/correlation.cuh +++ b/cpp/include/raft/distance/detail/correlation.cuh @@ -47,69 +47,81 @@ namespace detail { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void correlationImpl(const DataT *x, const DataT *y, const DataT *xn, - const DataT *yn, const DataT *x2n, const DataT *y2n, - IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, - IdxT ldd, OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +static void correlationImpl(const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + const DataT* x2n, + const DataT* y2n, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - acc += x * y; - }; + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; // epilogue operation lambda for final value calculation auto epilog_lambda = [x2n, y2n, m, n, k] __device__( AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, IdxT gridStrideY) { DataT regx2n[KPolicy::AccRowsPerTh], regy2n[KPolicy::AccColsPerTh]; extern __shared__ char smem[]; - DataT *sx2Norm = - (DataT *)(&smem[KPolicy::SmemSize + - (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)]); - DataT *sy2Norm = (&sx2Norm[KPolicy::Mblk]); + DataT* sx2Norm = + (DataT*)(&smem[KPolicy::SmemSize + (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)]); + DataT* sy2Norm = (&sx2Norm[KPolicy::Mblk]); // Load x & y norms required by this threadblock in shmem buffer if (gridStrideX == blockIdx.x * KPolicy::Nblk) { for (int i = threadIdx.x; i < KPolicy::Mblk; i += KPolicy::Nthreads) { - auto idx = gridStrideY + i; + auto idx = gridStrideY + i; sx2Norm[i] = idx < m ? x2n[idx] : 0; } } for (int i = threadIdx.x; i < KPolicy::Nblk; i += KPolicy::Nthreads) { - auto idx = gridStrideX + i; + auto idx = gridStrideX + i; sy2Norm[i] = idx < n ? y2n[idx] : 0; } __syncthreads(); #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { - regx2n[i] = - sx2Norm[i * KPolicy::AccThRows + (threadIdx.x / KPolicy::AccThCols)]; + regx2n[i] = sx2Norm[i * KPolicy::AccThRows + (threadIdx.x / KPolicy::AccThCols)]; } #pragma unroll for (int i = 0; i < KPolicy::AccColsPerTh; ++i) { - regy2n[i] = - sy2Norm[i * KPolicy::AccThCols + (threadIdx.x % KPolicy::AccThCols)]; + regy2n[i] = sy2Norm[i * KPolicy::AccThCols + (threadIdx.x % KPolicy::AccThCols)]; } #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - auto numer = k * acc[i][j] - (regxn[i] * regyn[j]); + auto numer = k * acc[i][j] - (regxn[i] * regyn[j]); auto Q_denom = k * regx2n[i] - (regxn[i] * regxn[i]); auto R_denom = k * regy2n[j] - (regyn[j] * regyn[j]); @@ -121,46 +133,68 @@ static void correlationImpl(const DataT *x, const DataT *y, const DataT *xn, constexpr size_t shmemSize = KPolicy::SmemSize + (2 * (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); if (isRowMajor) { - constexpr auto correlationRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - correlationRowMajor); + constexpr auto correlationRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, correlationRowMajor); correlationRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - constexpr auto correlationColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - correlationColMajor); + constexpr auto correlationColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, correlationColMajor); correlationColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void correlation(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, const DataT *xn, - const DataT *yn, const DataT *x2n, const DataT *y2n, - OutT *dOutput, FinalLambda fin_op, cudaStream_t stream) { +template +void correlation(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + const DataT* x2n, + const DataT* y2n, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - correlationImpl(x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, - dOutput, fin_op, stream); + correlationImpl( + x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - correlationImpl(x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, - dOutput, fin_op, stream); + correlationImpl( + x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { correlationImpl( x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -185,63 +219,118 @@ void correlation(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void correlationImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, AccType *workspace, size_t &worksize, - FinalLambda fin_op, cudaStream_t stream, bool isRowMajor) { +template +void correlationImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + AccType* workspace, + size_t& worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - correlationOutType; + typedef typename std::conditional::type correlationOutType; Index_ lda, ldb, ldd; - correlationOutType *pDcast = reinterpret_cast(pD); + correlationOutType* pDcast = reinterpret_cast(pD); ASSERT(!(((pA != pB) && (worksize < 2 * (m + n) * sizeof(AccType))) || (worksize < 2 * m * sizeof(AccType))), "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); - AccType *norm_col_vec = workspace; - AccType *norm_row_vec = workspace; - AccType *sq_norm_col_vec = workspace; - AccType *sq_norm_row_vec = workspace; + AccType* norm_col_vec = workspace; + AccType* norm_row_vec = workspace; + AccType* sq_norm_col_vec = workspace; + AccType* sq_norm_row_vec = workspace; if (pA != pB) { norm_row_vec += m; - raft::linalg::reduce(norm_col_vec, pA, k, m, (AccType)0, isRowMajor, true, - stream, false, raft::Nop(), + raft::linalg::reduce(norm_col_vec, + pA, + k, + m, + (AccType)0, + isRowMajor, + true, + stream, + false, + raft::Nop(), raft::Sum()); - raft::linalg::reduce(norm_row_vec, pB, k, n, (AccType)0, isRowMajor, true, - stream, false, raft::Nop(), + raft::linalg::reduce(norm_row_vec, + pB, + k, + n, + (AccType)0, + isRowMajor, + true, + stream, + false, + raft::Nop(), raft::Sum()); sq_norm_col_vec += (m + n); sq_norm_row_vec = sq_norm_col_vec + m; - raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, - isRowMajor, stream); - raft::linalg::rowNorm(sq_norm_row_vec, pB, k, n, raft::linalg::L2Norm, - isRowMajor, stream); + raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream); + raft::linalg::rowNorm(sq_norm_row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream); } else { - raft::linalg::reduce(norm_col_vec, pA, k, m, (AccType)0, isRowMajor, true, - stream, false, raft::Nop(), + raft::linalg::reduce(norm_col_vec, + pA, + k, + m, + (AccType)0, + isRowMajor, + true, + stream, + false, + raft::Nop(), raft::Sum()); sq_norm_col_vec += m; sq_norm_row_vec = sq_norm_col_vec; - raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, - isRowMajor, stream); + raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream); } if (isRowMajor) { lda = k, ldb = k, ldd = n; - correlation( - m, n, k, lda, ldb, ldd, pA, pB, norm_col_vec, norm_row_vec, - sq_norm_col_vec, sq_norm_row_vec, pDcast, fin_op, stream); + correlation(m, + n, + k, + lda, + ldb, + ldd, + pA, + pB, + norm_col_vec, + norm_row_vec, + sq_norm_col_vec, + sq_norm_row_vec, + pDcast, + fin_op, + stream); } else { lda = n, ldb = m, ldd = m; - correlation(n, m, k, lda, ldb, ldd, pB, pA, norm_row_vec, - norm_col_vec, sq_norm_row_vec, sq_norm_col_vec, pDcast, - fin_op, stream); + correlation(n, + m, + k, + lda, + ldb, + ldd, + pB, + pA, + norm_row_vec, + norm_col_vec, + sq_norm_row_vec, + sq_norm_col_vec, + pDcast, + fin_op, + stream); } } diff --git a/cpp/include/raft/distance/detail/cosine.cuh b/cpp/include/raft/distance/detail/cosine.cuh index 900e045edc..5684fd0a16 100644 --- a/cpp/include/raft/distance/detail/cosine.cuh +++ b/cpp/include/raft/distance/detail/cosine.cuh @@ -25,7 +25,7 @@ namespace detail { /** * @brief the cosine distance matrix calculation implementer - * It computes the following equation: + * It computes the following equation: * C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2))) * @tparam DataT input data-type (for A and B matrices) * @tparam AccT accumulation data-type @@ -50,30 +50,43 @@ namespace detail { * @param fin_op the final gemm epilogue lambda * @param stream cuda stream to launch cuda operations. */ -template -void cosineImpl(const DataT *x, const DataT *y, const DataT *xn, - const DataT *yn, IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, - IdxT ldd, OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +void cosineImpl(const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - acc += x * y; - }; + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll @@ -86,43 +99,66 @@ void cosineImpl(const DataT *x, const DataT *y, const DataT *xn, constexpr size_t shmemSize = KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); if (isRowMajor) { - auto cosineRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineRowMajor); + auto cosineRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineRowMajor); cosineRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto cosineColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineColMajor); + auto cosineColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineColMajor); cosineColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, const DataT *xn, const DataT *yn, - OutT *dOutput, FinalLambda fin_op, cudaStream_t stream) { +template +void cosine(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - cosineImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, - fin_op, stream); + cosineImpl( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - cosineImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, - fin_op, stream); + cosineImpl( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { cosineImpl( x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -131,7 +167,7 @@ void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, /** * @brief the expanded cosine distance matrix calculation - * It computes the following equation: + * It computes the following equation: * C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2))) * @tparam IType input data-type (for A and B matrices) * @tparam AccType accumulation data-type @@ -152,12 +188,23 @@ void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void cosineAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA, - const InType *pB, OutType *pD, AccType *workspace, - size_t worksize, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void cosineAlgo1(Index_ m, + Index_ n, + Index_ k, + const InType* pA, + const InType* pB, + OutType* pD, + AccType* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ auto norm_op = [] __device__(AccType in) { return raft::mySqrt(in); }; // Wrap fin_op to allow computing 1 - pA before calling fin_op @@ -166,39 +213,33 @@ void cosineAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA, }; typedef std::is_same is_bool; - typedef typename std::conditional::type - CosOutType; - CosOutType *pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type CosOutType; + CosOutType* pDcast = reinterpret_cast(pD); - ASSERT(!(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || - (worksize < m * sizeof(AccType))), - "workspace size error"); + ASSERT( + !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))), + "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); Index_ lda, ldb, ldd; - InType *col_vec = workspace; - InType *row_vec = workspace; + InType* col_vec = workspace; + InType* row_vec = workspace; if (pA != pB) { row_vec += m; - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); - raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); + raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } else { - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } if (isRowMajor) { lda = k, ldb = k, ldd = n; cosine( - m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, wrapped_fin_op, - stream); + m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, wrapped_fin_op, stream); } else { lda = n, ldb = m, ldd = m; - cosine(n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast, - wrapped_fin_op, stream); + cosine( + n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast, wrapped_fin_op, stream); } } diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index 199dc73fb6..91838e8bfa 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -85,211 +85,461 @@ enum DistanceType : unsigned short { }; namespace { -template struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg = 2.0f) {} + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg = 2.0f) + { + } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType) { - raft::distance::detail::euclideanAlgo1( - m, n, k, x, y, dist, false, (AccType *)workspace, worksize, fin_op, - stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::euclideanAlgo1( + m, n, k, x, y, dist, false, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType) { - raft::distance::detail::euclideanAlgo1( - m, n, k, x, y, dist, true, (AccType *)workspace, worksize, fin_op, stream, - isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::euclideanAlgo1( + m, n, k, x, y, dist, true, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType) { - raft::distance::detail::cosineAlgo1(m, n, k, x, y, dist, - (AccType *)workspace, worksize, - fin_op, stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::cosineAlgo1( + m, n, k, x, y, dist, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::euclideanAlgo2( +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::euclideanAlgo2( m, n, k, x, y, dist, false, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::euclideanAlgo2( +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::euclideanAlgo2( m, n, k, x, y, dist, true, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::l1Impl(m, n, k, x, y, dist, fin_op, stream, - isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::l1Impl( + m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::chebyshevImpl(m, n, k, x, y, dist, fin_op, - stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::chebyshevImpl( + m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::hellingerImpl(m, n, k, x, y, dist, fin_op, - stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::hellingerImpl( + m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType metric_arg) { - raft::distance::detail::minkowskiImpl( +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) + { + raft::distance::detail::minkowskiImpl( m, n, k, x, y, dist, fin_op, stream, isRowMajor, metric_arg); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::canberraImpl(m, n, k, x, y, dist, fin_op, - stream, isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::canberraImpl( + m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::hammingUnexpandedImpl( +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::hammingUnexpandedImpl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::jensenShannonImpl( +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::jensenShannonImpl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::russellRaoImpl( +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::russellRaoImpl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor, InType) { - raft::distance::detail::klDivergenceImpl( +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void*, + size_t, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::klDivergenceImpl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } }; -template -struct DistanceImpl { - void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType) { - raft::distance::detail::correlationImpl( - m, n, k, x, y, dist, (AccType *)workspace, worksize, fin_op, stream, - isRowMajor); +template +struct DistanceImpl { + void run(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType) + { + raft::distance::detail::correlationImpl( + m, n, k, x, y, dist, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); } }; @@ -320,53 +570,71 @@ struct DistanceImplOutType fin_op(AccType in, int g_idx);. If one needs * any other parameters, feel free to pass them via closure. */ -template -void distance(const InType *x, const InType *y, OutType *dist, Index_ m, - Index_ n, Index_ k, void *workspace, size_t worksize, - FinalLambda fin_op, cudaStream_t stream, bool isRowMajor = true, - InType metric_arg = 2.0f) { - DistanceImpl - distImpl; - distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream, - isRowMajor, metric_arg); +void distance(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor = true, + InType metric_arg = 2.0f) +{ + DistanceImpl distImpl; + distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg); CUDA_CHECK(cudaPeekAtLastError()); } /** - * @brief Evaluate pairwise distances for the simple use case - * @tparam DistanceType which distance to evaluate - * @tparam InType input argument type - * @tparam AccType accumulation type - * @tparam OutType output type - * @tparam Index_ Index type - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param workspace temporary workspace needed for computations - * @param worksize number of bytes of the workspace - * @param stream cuda stream - * @param isRowMajor whether the matrices are row-major or col-major - * - * @note if workspace is passed as nullptr, this will return in - * worksize, the number of bytes of workspace required - */ -template -void distance(const InType *x, const InType *y, OutType *dist, Index_ m, - Index_ n, Index_ k, void *workspace, size_t worksize, - cudaStream_t stream, bool isRowMajor = true, - InType metric_arg = 2.0f) { - auto default_fin_op = [] __device__(AccType d_val, Index_ g_d_idx) { - return d_val; - }; - distance(x, y, dist, m, n, k, workspace, worksize, default_fin_op, - stream, isRowMajor, metric_arg); + * @brief Evaluate pairwise distances for the simple use case + * @tparam DistanceType which distance to evaluate + * @tparam InType input argument type + * @tparam AccType accumulation type + * @tparam OutType output type + * @tparam Index_ Index type + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param workspace temporary workspace needed for computations + * @param worksize number of bytes of the workspace + * @param stream cuda stream + * @param isRowMajor whether the matrices are row-major or col-major + * + * @note if workspace is passed as nullptr, this will return in + * worksize, the number of bytes of workspace required + */ +template +void distance(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + cudaStream_t stream, + bool isRowMajor = true, + InType metric_arg = 2.0f) +{ + auto default_fin_op = [] __device__(AccType d_val, Index_ g_d_idx) { return d_val; }; + distance( + x, y, dist, m, n, k, workspace, worksize, default_fin_op, stream, isRowMajor, metric_arg); CUDA_CHECK(cudaPeekAtLastError()); } @@ -386,14 +654,16 @@ void distance(const InType *x, const InType *y, OutType *dist, Index_ m, * @note If the specifed distanceType doesn't need the workspace at all, it * returns 0. */ -template -size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n, - Index_ k) { - size_t worksize = 0; - constexpr bool is_allocated = - (distanceType <= raft::distance::DistanceType::CosineExpanded) || - (distanceType == raft::distance::DistanceType::CorrelationExpanded); +template +size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k) +{ + size_t worksize = 0; + constexpr bool is_allocated = (distanceType <= raft::distance::DistanceType::CosineExpanded) || + (distanceType == raft::distance::DistanceType::CorrelationExpanded); constexpr int numOfBuffers = (distanceType == raft::distance::DistanceType::CorrelationExpanded) ? 2 : 1; @@ -425,17 +695,21 @@ size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n, * @param isRowMajor whether the matrices are row-major or col-major */ template -void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m, - Index_ n, Index_ k, - rmm::device_uvector &workspace, - cudaStream_t stream, bool isRowMajor, - Type metric_arg = 2.0f) { - auto worksize = - getWorkspaceSize(x, y, m, n, k); +void pairwise_distance_impl(const Type* x, + const Type* y, + Type* dist, + Index_ m, + Index_ n, + Index_ k, + rmm::device_uvector& workspace, + cudaStream_t stream, + bool isRowMajor, + Type metric_arg = 2.0f) +{ + auto worksize = getWorkspaceSize(x, y, m, n, k); workspace.resize(worksize, stream); - distance(x, y, dist, m, n, k, - workspace.data(), worksize, - stream, isRowMajor, metric_arg); + distance( + x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor, metric_arg); } /** @} */ }; // namespace detail diff --git a/cpp/include/raft/distance/detail/euclidean.cuh b/cpp/include/raft/distance/detail/euclidean.cuh index 8b8882c244..1166543f8c 100644 --- a/cpp/include/raft/distance/detail/euclidean.cuh +++ b/cpp/include/raft/distance/detail/euclidean.cuh @@ -49,30 +49,44 @@ namespace detail { * @param fin_op the final gemm epilogue lambda * @param stream cuda stream to launch cuda operations. */ -template -void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn, - const DataT *yn, IdxT m, IdxT n, IdxT k, IdxT lda, - IdxT ldb, IdxT ldd, bool sqrt, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void euclideanExpImpl(const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + bool sqrt, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - acc += x * y; - }; + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [sqrt] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll @@ -94,47 +108,68 @@ void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn, constexpr size_t shmemSize = KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); if (isRowMajor) { - auto euclideanExpRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, shmemSize, euclideanExpRowMajor); + auto euclideanExpRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, euclideanExpRowMajor); euclideanExpRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto euclideanExpColMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, shmemSize, euclideanExpColMajor); + auto euclideanExpColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, shmemSize, euclideanExpColMajor); euclideanExpColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, - fin_op); + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void euclideanExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, const DataT *xn, - const DataT *yn, bool sqrt, OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +void euclideanExp(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + bool sqrt, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - euclideanExpImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, - dOutput, fin_op, stream); + euclideanExpImpl( + x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - euclideanExpImpl(x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, - dOutput, fin_op, stream); + euclideanExpImpl( + x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); } else { euclideanExpImpl( x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); @@ -162,53 +197,59 @@ void euclideanExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void euclideanAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA, - const InType *pB, OutType *pD, bool enable_sqrt, - AccType *workspace, size_t &worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor) { +template +void euclideanAlgo1(Index_ m, + Index_ n, + Index_ k, + const InType* pA, + const InType* pB, + OutType* pD, + bool enable_sqrt, + AccType* workspace, + size_t& worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ auto norm_op = [] __device__(InType in) { return in; }; typedef std::is_same is_bool; - typedef typename std::conditional::type - ExpOutType; - ExpOutType *pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type ExpOutType; + ExpOutType* pDcast = reinterpret_cast(pD); - ASSERT(!(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || - (worksize < m * sizeof(AccType))), - "workspace size error"); + ASSERT( + !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))), + "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); Index_ lda, ldb, ldd; - InType *col_vec = workspace; - InType *row_vec = workspace; + InType* col_vec = workspace; + InType* row_vec = workspace; if (pA != pB) { row_vec += m; - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); - raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); + raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } else { - raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } if (isRowMajor) { lda = k, ldb = k, ldd = n; euclideanExp( - m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast, - fin_op, stream); + m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast, fin_op, stream); } else { lda = n, ldb = m, ldd = m; euclideanExp( - n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast, - fin_op, stream); + n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast, fin_op, stream); } } /** - * @brief the unexpanded euclidean distance matrix calculation + * @brief the unexpanded euclidean distance matrix calculation * It computes the following equation: cij = op((ai-bj)^2) * @tparam DataT input data-type (for A and B matrices) * @tparam AccT accumulation data-type @@ -228,16 +269,30 @@ void euclideanAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA, * @param[output] pD output matrix * @param fin_op the final gemm epilogue lambda */ -template -void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, bool sqrt, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void euclideanUnExpImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + bool sqrt, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); @@ -248,10 +303,11 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [sqrt] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { if (sqrt) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { @@ -264,48 +320,68 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; if (isRowMajor) { - auto euclideanUnExpRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - euclideanUnExpRowMajor); + auto euclideanUnExpRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, euclideanUnExpRowMajor); euclideanUnExpRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto euclideanUnExpColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - euclideanUnExpColMajor); + auto euclideanUnExpColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, euclideanUnExpColMajor); euclideanUnExpColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void euclideanUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, bool sqrt, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void euclideanUnExp(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + bool sqrt, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - euclideanUnExpImpl(x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, - fin_op, stream); + euclideanUnExpImpl( + x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - euclideanUnExpImpl(x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, - fin_op, stream); + euclideanUnExpImpl( + x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); } else { euclideanUnExpImpl( x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); @@ -331,15 +407,25 @@ void euclideanUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void euclideanAlgo2(Index_ m, Index_ n, Index_ k, const InType *pA, - const InType *pB, OutType *pD, bool enable_sqrt, - FinalLambda fin_op, cudaStream_t stream, bool isRowMajor) { +template +void euclideanAlgo2(Index_ m, + Index_ n, + Index_ k, + const InType* pA, + const InType* pB, + OutType* pD, + bool enable_sqrt, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - UnExpOutType; - UnExpOutType *pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type UnExpOutType; + UnExpOutType* pDcast = reinterpret_cast(pD); Index_ lda, ldb, ldd; if (isRowMajor) { diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh index ca8f729a68..9373992ada 100644 --- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh @@ -36,24 +36,24 @@ template struct KVPMinReduceImpl { typedef cub::KeyValuePair KVP; - DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } }; // KVPMinReduce template struct MinAndDistanceReduceOpImpl { typedef typename cub::KeyValuePair KVP; - DI void operator()(LabelT rid, KVP* out, const KVP& other) { + DI void operator()(LabelT rid, KVP* out, const KVP& other) + { if (other.value < out->value) { - out->key = other.key; + out->key = other.key; out->value = other.value; } } - DI void init(KVP* out, DataT maxVal) { - out->key = -1; + DI void init(KVP* out, DataT maxVal) + { + out->key = -1; out->value = maxVal; } }; @@ -61,38 +61,35 @@ struct MinAndDistanceReduceOpImpl { template struct MinReduceOpImpl { typedef typename cub::KeyValuePair KVP; - DI void operator()(LabelT rid, DataT* out, const KVP& other) { - if (other.value < *out) { - *out = other.value; - } + DI void operator()(LabelT rid, DataT* out, const KVP& other) + { + if (other.value < *out) { *out = other.value; } } DI void init(DataT* out, DataT maxVal) { *out = maxVal; } }; template -__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) { +__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) +{ auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < m) { - redOp.init(min + tid, maxVal); - } + if (tid < m) { redOp.init(min + tid, maxVal); } } template -void initialize(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp, - cudaStream_t stream) { +void initialize(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp, cudaStream_t stream) +{ auto blks = raft::ceildiv(m, 256); - initKernel - <<>>(min, m, maxVal, redOp); + initKernel<<>>(min, m, maxVal, redOp); } // TODO: specialize this function for MinAndDistanceReduceOp // with atomicCAS of 64 bit which will eliminate mutex and shfls -template -DI void updateReducedVal(int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, - IdxT m, IdxT gridStrideY) { - const auto lid = threadIdx.x % raft::WarpSize; +template +DI void updateReducedVal( + int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, IdxT m, IdxT gridStrideY) +{ + const auto lid = threadIdx.x % raft::WarpSize; const auto accrowid = threadIdx.x / P::AccThCols; // for now have first lane from each warp update a unique output row. This @@ -117,21 +114,38 @@ DI void updateReducedVal(int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, if (j < (raft::WarpSize / P::AccThCols) - 1) { #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { - auto tmpkey = raft::shfl(val[i].key, (j + 1) * P::AccThCols); + auto tmpkey = raft::shfl(val[i].key, (j + 1) * P::AccThCols); auto tmpvalue = raft::shfl(val[i].value, (j + 1) * P::AccThCols); - val[i] = {tmpkey, tmpvalue}; + val[i] = {tmpkey, tmpvalue}; } } } } -template -__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( - OutT* min, const DataT* x, const DataT* y, const DataT* xn, const DataT* yn, - IdxT m, IdxT n, IdxT k, DataT maxVal, int* mutex, ReduceOpT redOp, - KVPReduceOpT pairRedOp, CoreLambda core_op, FinalLambda fin_op) { +__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + DataT maxVal, + int* mutex, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + CoreLambda core_op, + FinalLambda fin_op) +{ extern __shared__ char smem[]; typedef cub::KeyValuePair KVPair; @@ -144,7 +158,9 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( // epilogue operation lambda for final value calculation auto epilog_lambda = [n, pairRedOp, &val, maxVal] __device__( DataT acc[P::AccRowsPerTh][P::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, IdxT gridStrideY) { KVPReduceOpT pairRed_op(pairRedOp); @@ -173,72 +189,105 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel( #pragma unroll for (int j = 0; j < P::AccColsPerTh; ++j) { auto tmpkey = acccolid + j * P::AccThCols + gridStrideX; - KVPair tmp = {tmpkey, acc[i][j]}; + KVPair tmp = {tmpkey, acc[i][j]}; if (tmpkey < n) { - val[i] = - pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); + val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); } } } }; - auto rowEpilog_lambda = [m, mutex, min, pairRedOp, redOp, &val, - maxVal] __device__(IdxT gridStrideY) { - KVPReduceOpT pairRed_op(pairRedOp); - ReduceOpT red_op(redOp); + auto rowEpilog_lambda = + [m, mutex, min, pairRedOp, redOp, &val, maxVal] __device__(IdxT gridStrideY) { + KVPReduceOpT pairRed_op(pairRedOp); + ReduceOpT red_op(redOp); - const auto accrowid = threadIdx.x / P::AccThCols; - const auto lid = raft::laneId(); + const auto accrowid = threadIdx.x / P::AccThCols; + const auto lid = raft::laneId(); // reduce #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { + for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll - for (int j = P::AccThCols / 2; j > 0; j >>= 1) { - auto tmpkey = raft::shfl(val[i].key, lid + j); - auto tmpvalue = raft::shfl(val[i].value, lid + j); - KVPair tmp = {tmpkey, tmpvalue}; - val[i] = - pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); + for (int j = P::AccThCols / 2; j > 0; j >>= 1) { + auto tmpkey = raft::shfl(val[i].key, lid + j); + auto tmpvalue = raft::shfl(val[i].value, lid + j); + KVPair tmp = {tmpkey, tmpvalue}; + val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); + } } - } - updateReducedVal(mutex, min, val, red_op, - m, gridStrideY); + updateReducedVal(mutex, min, val, red_op, m, gridStrideY); // reset the val array. #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - val[i] = {-1, maxVal}; - } - }; + for (int i = 0; i < P::AccRowsPerTh; ++i) { + val[i] = {-1, maxVal}; + } + }; IdxT lda = k, ldb = k, ldd = n; - PairwiseDistances - obj(x, y, m, n, k, lda, ldb, ldd, xn, yn, nullptr, smem, core_op, - epilog_lambda, fin_op, rowEpilog_lambda); + PairwiseDistances + obj(x, + y, + m, + n, + k, + lda, + ldb, + ldd, + xn, + yn, + nullptr, + smem, + core_op, + epilog_lambda, + fin_op, + rowEpilog_lambda); obj.run(); } -template -void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, - const DataT* yn, IdxT m, IdxT n, IdxT k, int* workspace, - ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt, - bool initOutBuffer, cudaStream_t stream) { +template +void fusedL2NNImpl(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + int* workspace, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream) +{ typedef typename linalg::Policy4x4::Policy P; dim3 blk(P::Nthreads); - auto nblks = raft::ceildiv(m, P::Nthreads); + auto nblks = raft::ceildiv(m, P::Nthreads); constexpr auto maxVal = std::numeric_limits::max(); typedef cub::KeyValuePair KVPair; // Accumulation operation lambda - auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) { - acc += x * y; - }; + auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) { acc += x * y; }; CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream)); if (initOutBuffer) { @@ -249,25 +298,34 @@ void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn, auto fin_op = [] __device__(DataT d_val, int g_d_idx) { return d_val; }; - constexpr size_t shmemSize = - P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT)); + constexpr size_t shmemSize = P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT)); if (sqrt) { - auto fusedL2NNSqrt = - fusedL2NNkernel; - dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NNSqrt); + auto fusedL2NNSqrt = fusedL2NNkernel; + dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NNSqrt); fusedL2NNSqrt<<>>( - min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, - core_lambda, fin_op); + min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op); } else { - auto fusedL2NN = - fusedL2NNkernel; - dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NN); - fusedL2NN<<>>(min, x, y, xn, yn, m, n, k, - maxVal, workspace, redOp, - pairRedOp, core_lambda, fin_op); + auto fusedL2NN = fusedL2NNkernel; + dim3 grid = launchConfigGenerator

(m, n, shmemSize, fusedL2NN); + fusedL2NN<<>>( + min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); diff --git a/cpp/include/raft/distance/detail/hamming.cuh b/cpp/include/raft/distance/detail/hamming.cuh index 0169ba33a2..886b9d1426 100644 --- a/cpp/include/raft/distance/detail/hamming.cuh +++ b/cpp/include/raft/distance/detail/hamming.cuh @@ -23,7 +23,7 @@ namespace detail { /** * @brief the Hamming distance matrix using the unexpanded form: - * It computes the following equation: + * It computes the following equation: Cij = sum(x_i != y_i) / k * * @tparam DataT input data-type (for A and B matrices) @@ -47,30 +47,41 @@ namespace detail { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void hammingUnexpandedImpl(const DataT *x, const DataT *y, IdxT m, - IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +static void hammingUnexpandedImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - acc += (x != y); - }; + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += (x != y); }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [k] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [k] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { const DataT one_over_k = DataT(1.0) / k; #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { @@ -82,46 +93,65 @@ static void hammingUnexpandedImpl(const DataT *x, const DataT *y, IdxT m, }; if (isRowMajor) { - auto hammingUnexpandedRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - hammingUnexpandedRowMajor); + auto hammingUnexpandedRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hammingUnexpandedRowMajor); hammingUnexpandedRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto hammingUnexpandedColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - hammingUnexpandedColMajor); + auto hammingUnexpandedColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hammingUnexpandedColMajor); hammingUnexpandedColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void hammingUnexpanded(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void hammingUnexpanded(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - hammingUnexpandedImpl(x, y, m, n, k, lda, ldb, ldd, - dOutput, fin_op, stream); + hammingUnexpandedImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - hammingUnexpandedImpl(x, y, m, n, k, lda, ldb, ldd, - dOutput, fin_op, stream); + hammingUnexpandedImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { hammingUnexpandedImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -130,7 +160,7 @@ void hammingUnexpanded(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, /** * @brief the Hamming Unexpanded distance matrix calculation - * It computes the following equation: + * It computes the following equation: Cij = sum(x_i != y_i) / k * * @tparam InType input data-type (for A and B matrices) @@ -148,28 +178,35 @@ void hammingUnexpanded(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void hammingUnexpandedImpl(int m, int n, int k, const InType *pA, - const InType *pB, OutType *pD, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor) { +template +void hammingUnexpandedImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - hammingUnexpandedOutType; + typedef + typename std::conditional::type hammingUnexpandedOutType; Index_ lda, ldb, ldd; - hammingUnexpandedOutType *pDcast = - reinterpret_cast(pD); + hammingUnexpandedOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; - hammingUnexpanded(m, n, k, lda, ldb, ldd, pA, pB, pDcast, - fin_op, stream); + hammingUnexpanded( + m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); } else { lda = n, ldb = m, ldd = m; - hammingUnexpanded(n, m, k, lda, ldb, ldd, pB, pA, - pDcast, fin_op, stream); + hammingUnexpanded( + n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); } } diff --git a/cpp/include/raft/distance/detail/hellinger.cuh b/cpp/include/raft/distance/detail/hellinger.cuh index 933d850dbf..189bbed491 100644 --- a/cpp/include/raft/distance/detail/hellinger.cuh +++ b/cpp/include/raft/distance/detail/hellinger.cuh @@ -24,7 +24,7 @@ namespace detail { /** * @brief the Hellinger distance matrix using the expanded form: - * It computes the following equation: + * It computes the following equation: cij = sqrt(1 - sum(sqrt(x_k * y_k))) * This distance computation modifies A and B by computing a sqrt * and then performing a `pow(x, 2)` to convert it back. Because of this, @@ -52,29 +52,40 @@ namespace detail { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void hellingerImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, - IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +static void hellingerImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); - auto unaryOp_lambda = [] __device__(DataT input) { - return raft::mySqrt(input); - }; + auto unaryOp_lambda = [] __device__(DataT input) { return raft::mySqrt(input); }; // First sqrt x and y raft::linalg::unaryOp( - (DataT *)x, x, m * k, unaryOp_lambda, stream); + (DataT*)x, x, m * k, unaryOp_lambda, stream); if (x != y) { raft::linalg::unaryOp( - (DataT *)y, y, n * k, unaryOp_lambda, stream); + (DataT*)y, y, n * k, unaryOp_lambda, stream); } // Accumulation operation lambda @@ -85,71 +96,91 @@ static void hellingerImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative - const auto finalVal = (1 - acc[i][j]); + const auto finalVal = (1 - acc[i][j]); const auto rectifier = (!signbit(finalVal)); - acc[i][j] = raft::mySqrt(rectifier * finalVal); + acc[i][j] = raft::mySqrt(rectifier * finalVal); } } }; if (isRowMajor) { - auto hellingerRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - hellingerRowMajor); + auto hellingerRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hellingerRowMajor); hellingerRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto hellingerColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - hellingerColMajor); + auto hellingerColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hellingerColMajor); hellingerColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } // Revert sqrt of x and y raft::linalg::unaryOp( - (DataT *)x, x, m * k, unaryOp_lambda, stream); + (DataT*)x, x, m * k, unaryOp_lambda, stream); if (x != y) { raft::linalg::unaryOp( - (DataT *)y, y, n * k, unaryOp_lambda, stream); + (DataT*)y, y, n * k, unaryOp_lambda, stream); } CUDA_CHECK(cudaGetLastError()); } -template -void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void hellinger(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - hellingerImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + hellingerImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - hellingerImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + hellingerImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { hellingerImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -158,7 +189,7 @@ void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, /** * @brief the Hellinger distance matrix calculation - * It computes the following equation: + * It computes the following equation: sqrt(1 - sum(sqrt(x_k * y_k)) * This distance computation modifies A and B by computing a sqrt * and then performing a `pow(x, 2)` to convert it back. Because of this, @@ -180,16 +211,25 @@ void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void hellingerImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void hellingerImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - hellingerOutType; + typedef typename std::conditional::type hellingerOutType; Index_ lda, ldb, ldd; - hellingerOutType *pDcast = reinterpret_cast(pD); + hellingerOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; hellinger( diff --git a/cpp/include/raft/distance/detail/jensen_shannon.cuh b/cpp/include/raft/distance/detail/jensen_shannon.cuh index 1e39f39682..b3240fe398 100644 --- a/cpp/include/raft/distance/detail/jensen_shannon.cuh +++ b/cpp/include/raft/distance/detail/jensen_shannon.cuh @@ -23,7 +23,7 @@ namespace detail { /** * @brief the Jensen Shannon distance matrix: - * It computes the following equation: + * It computes the following equation: Cij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i)) + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i))))) * @@ -48,37 +48,49 @@ namespace detail { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void jensenShannonImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, - IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +static void jensenShannonImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - const DataT m = 0.5f * (x + y); + const DataT m = 0.5f * (x + y); const bool m_zero = (m == 0); - const auto logM = (!m_zero) * raft::myLog(m + m_zero); + const auto logM = (!m_zero) * raft::myLog(m + m_zero); const bool x_zero = (x == 0); const bool y_zero = (y == 0); - acc += (-x * (logM - raft::myLog(x + x_zero))) + - (-y * (logM - raft::myLog(y + y_zero))); + acc += (-x * (logM - raft::myLog(x + x_zero))) + (-y * (logM - raft::myLog(y + y_zero))); }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll @@ -89,46 +101,65 @@ static void jensenShannonImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, }; if (isRowMajor) { - auto jensenShannonRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - jensenShannonRowMajor); + auto jensenShannonRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, jensenShannonRowMajor); jensenShannonRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto jensenShannonColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - jensenShannonColMajor); + auto jensenShannonColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, jensenShannonColMajor); jensenShannonColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void jensenShannon(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void jensenShannon(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - jensenShannonImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + jensenShannonImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - jensenShannonImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + jensenShannonImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { jensenShannonImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -137,7 +168,7 @@ void jensenShannon(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, /** * @brief the Jensen Shannon distance matrix calculation - * It computes the following equation: + * It computes the following equation: Cij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i)) + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i))))) * @@ -156,26 +187,34 @@ void jensenShannon(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void jensenShannonImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void jensenShannonImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - jensenShannonOutType; + typedef typename std::conditional::type jensenShannonOutType; Index_ lda, ldb, ldd; - jensenShannonOutType *pDcast = reinterpret_cast(pD); + jensenShannonOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; - jensenShannon(m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); + jensenShannon( + m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); } else { lda = n, ldb = m, ldd = m; - jensenShannon(n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, - stream); + jensenShannon( + n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); } } } // namespace detail diff --git a/cpp/include/raft/distance/detail/kl_divergence.cuh b/cpp/include/raft/distance/detail/kl_divergence.cuh index 5a18ba1670..31127a4d8d 100644 --- a/cpp/include/raft/distance/detail/kl_divergence.cuh +++ b/cpp/include/raft/distance/detail/kl_divergence.cuh @@ -23,7 +23,7 @@ namespace detail { /** * @brief the KL Divergence distance matrix: - * It computes the following equation: + * It computes the following equation: Cij = 0.5 * sum(x * log (x / y)); * This distance computation modifies A or B by computing a log(x) * and then performing a `pow(e, log(x))` to convert it back. Because of this, @@ -51,17 +51,29 @@ namespace detail { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void klDivergenceImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, - IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +static void klDivergenceImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); @@ -80,13 +92,11 @@ static void klDivergenceImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, if (isRowMajor) { const bool x_zero = (x == 0); const bool y_zero = (y == 0); - acc += - x * (raft::myLog(x + x_zero) - (!y_zero) * raft::myLog(y + y_zero)); + acc += x * (raft::myLog(x + x_zero) - (!y_zero) * raft::myLog(y + y_zero)); } else { const bool y_zero = (y == 0); const bool x_zero = (x == 0); - acc += - y * (raft::myLog(y + y_zero) - (!x_zero) * raft::myLog(x + x_zero)); + acc += y * (raft::myLog(y + y_zero) - (!x_zero) * raft::myLog(x + x_zero)); } }; @@ -102,10 +112,11 @@ static void klDivergenceImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { #pragma unroll @@ -116,79 +127,158 @@ static void klDivergenceImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, }; if (isRowMajor) { - constexpr auto klDivergenceRowMajor = - pairwiseDistanceMatKernel; + constexpr auto klDivergenceRowMajor = pairwiseDistanceMatKernel; constexpr auto klDivergenceRowMajorXequalY = - pairwiseDistanceMatKernel; + decltype(epilog_lambda), + FinalLambda, + true>; if (x != y) { raft::linalg::unaryOp( - (DataT *)y, y, n * k, unaryOp_lambda, stream); - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - klDivergenceRowMajor); - klDivergenceRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + (DataT*)y, y, n * k, unaryOp_lambda, stream); + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, klDivergenceRowMajor); + klDivergenceRowMajor<<>>(x, + y, + nullptr, + nullptr, + m, + n, + k, + lda, + ldb, + ldd, + dOutput, + core_lambda, + epilog_lambda, + fin_op); // Now reverse previous log (x) back to x using (e ^ log(x)) raft::linalg::unaryOp( - (DataT *)y, y, n * k, unaryOp_lambda_reverse, stream); + (DataT*)y, y, n * k, unaryOp_lambda_reverse, stream); } else { - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - klDivergenceRowMajorXequalY); - klDivergenceRowMajorXequalY<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, - core_lambda_x_equal_y, epilog_lambda, fin_op); + dim3 grid = + launchConfigGenerator(m, n, KPolicy::SmemSize, klDivergenceRowMajorXequalY); + klDivergenceRowMajorXequalY<<>>(x, + y, + nullptr, + nullptr, + m, + n, + k, + lda, + ldb, + ldd, + dOutput, + core_lambda_x_equal_y, + epilog_lambda, + fin_op); } } else { - constexpr auto klDivergenceColMajor = - pairwiseDistanceMatKernel; + constexpr auto klDivergenceColMajor = pairwiseDistanceMatKernel; constexpr auto klDivergenceColMajorXequalY = - pairwiseDistanceMatKernel; + decltype(epilog_lambda), + FinalLambda, + false>; if (x != y) { raft::linalg::unaryOp( - (DataT *)x, x, m * k, unaryOp_lambda, stream); - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - klDivergenceColMajor); - klDivergenceColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + (DataT*)x, x, m * k, unaryOp_lambda, stream); + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, klDivergenceColMajor); + klDivergenceColMajor<<>>(x, + y, + nullptr, + nullptr, + m, + n, + k, + lda, + ldb, + ldd, + dOutput, + core_lambda, + epilog_lambda, + fin_op); // Now reverse previous log (x) back to x using (e ^ log(x)) raft::linalg::unaryOp( - (DataT *)x, x, m * k, unaryOp_lambda_reverse, stream); + (DataT*)x, x, m * k, unaryOp_lambda_reverse, stream); } else { - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - klDivergenceColMajorXequalY); - klDivergenceColMajorXequalY<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, - core_lambda_x_equal_y, epilog_lambda, fin_op); + dim3 grid = + launchConfigGenerator(m, n, KPolicy::SmemSize, klDivergenceColMajorXequalY); + klDivergenceColMajorXequalY<<>>(x, + y, + nullptr, + nullptr, + m, + n, + k, + lda, + ldb, + ldd, + dOutput, + core_lambda_x_equal_y, + epilog_lambda, + fin_op); } } CUDA_CHECK(cudaGetLastError()); } -template -void klDivergence(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void klDivergence(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - klDivergenceImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + klDivergenceImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - klDivergenceImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + klDivergenceImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { klDivergenceImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -197,7 +287,7 @@ void klDivergence(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, /** * @brief the KL Divergence distance matrix calculation - * It computes the following equation: + * It computes the following equation: Cij = 0.5 * sum(x * log (x / y)); * This distance computation modifies A or B by computing a log(x) * and then performing a `pow(e, log(x))` to convert it back. Because of this, @@ -218,25 +308,34 @@ void klDivergence(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void klDivergenceImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void klDivergenceImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - klDivergenceOutType; + typedef typename std::conditional::type klDivergenceOutType; Index_ lda, ldb, ldd; - klDivergenceOutType *pDcast = reinterpret_cast(pD); + klDivergenceOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; - klDivergence(m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); + klDivergence( + m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); } else { lda = n, ldb = m, ldd = m; - klDivergence(n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); + klDivergence( + n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); } } } // namespace detail diff --git a/cpp/include/raft/distance/detail/l1.cuh b/cpp/include/raft/distance/detail/l1.cuh index 33e9bae206..e444e65d1f 100644 --- a/cpp/include/raft/distance/detail/l1.cuh +++ b/cpp/include/raft/distance/detail/l1.cuh @@ -43,16 +43,29 @@ namespace detail { * @param[output] pD output matrix * @param fin_op the final gemm epilogue lambda */ -template -static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +static void l1Impl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); @@ -63,47 +76,69 @@ static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { return; }; + auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { return; }; if (isRowMajor) { - auto l1RowMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, l1RowMajor); + auto l1RowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, l1RowMajor); l1RowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto l1ColMajor = - pairwiseDistanceMatKernel; - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, l1ColMajor); + auto l1ColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, l1ColMajor); l1ColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void l1(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, const DataT *x, - const DataT *y, OutT *dOutput, FinalLambda fin_op, - cudaStream_t stream) { +template +void l1(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - l1Impl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + l1Impl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { l1Impl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -131,16 +166,25 @@ void l1(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, const DataT *x, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void l1Impl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void l1Impl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef - typename std::conditional::type L1OutType; + typedef typename std::conditional::type L1OutType; Index_ lda, ldb, ldd; - L1OutType *pDcast = reinterpret_cast(pD); + L1OutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; l1( diff --git a/cpp/include/raft/distance/detail/minkowski.cuh b/cpp/include/raft/distance/detail/minkowski.cuh index 8bd3deb08f..22a183c22c 100644 --- a/cpp/include/raft/distance/detail/minkowski.cuh +++ b/cpp/include/raft/distance/detail/minkowski.cuh @@ -22,7 +22,7 @@ namespace distance { namespace detail { /** - * @brief the unexpanded Minkowski distance matrix calculation + * @brief the unexpanded Minkowski distance matrix calculation * It computes the following equation: cij = sum(|x - y|^p)^(1/p) * @tparam DataT input data-type (for A and B matrices) * @tparam AccT accumulation data-type @@ -45,16 +45,30 @@ namespace detail { * @param[in] stream cuda stream to launch work * @param[in] the value of `p` for Minkowski (l-p) distances. */ -template -void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream, DataT p) { +template +void minkowskiUnExpImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream, + DataT p) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); @@ -65,10 +79,11 @@ void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; // epilogue operation lambda for final value calculation - auto epilog_lambda = [p] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, - IdxT gridStrideY) { + auto epilog_lambda = [p] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { const auto one_over_p = 1.0f / p; #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { @@ -80,48 +95,68 @@ void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, }; if (isRowMajor) { - auto minkowskiUnExpRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - minkowskiUnExpRowMajor); + auto minkowskiUnExpRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, minkowskiUnExpRowMajor); minkowskiUnExpRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - auto minkowskiUnExpColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - minkowskiUnExpColMajor); + auto minkowskiUnExpColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, minkowskiUnExpColMajor); minkowskiUnExpColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void minkowskiUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream, DataT metric_arg) { +template +void minkowskiUnExp(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream, + DataT metric_arg) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - minkowskiUnExpImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, - fin_op, stream, metric_arg); + minkowskiUnExpImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - minkowskiUnExpImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, - fin_op, stream, metric_arg); + minkowskiUnExpImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); } else { minkowskiUnExpImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); @@ -147,15 +182,25 @@ void minkowskiUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param[in] isRowMajor whether the input and output matrices are row major * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances. */ -template -void minkowskiImpl(Index_ m, Index_ n, Index_ k, const InType *pA, - const InType *pB, OutType *pD, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { +template +void minkowskiImpl(Index_ m, + Index_ n, + Index_ k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor, + InType metric_arg) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - LpUnexpOutType; - LpUnexpOutType *pDcast = reinterpret_cast(pD); + typedef typename std::conditional::type LpUnexpOutType; + LpUnexpOutType* pDcast = reinterpret_cast(pD); Index_ lda, ldb, ldd; if (isRowMajor) { diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh index a98bda1541..8fa7801c70 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh @@ -34,11 +34,11 @@ namespace detail { * @tparam OutT output data-type (for C and D matrices) * @tparam IdxT index data-type * @tparam Policy struct which tunes the Contraction kernel - * @tparam CoreLambda tells how to accumulate an x and y into + * @tparam CoreLambda tells how to accumulate an x and y into acc. its signature: template void core_lambda(AccT& acc, const DataT& x, const DataT& y) - * @tparam EpilogueLambda applies an elementwise function to compute final + * @tparam EpilogueLambda applies an elementwise function to compute final values. Its signature is: template void epilogue_lambda (AccT acc[][], DataT* regxn, DataT* regyn); @@ -60,21 +60,27 @@ namespace detail { * @param fin_op the final gemm epilogue lambda */ -template > +template > struct PairwiseDistances : public BaseClass { private: typedef Policy P; - const DataT *xn; - const DataT *yn; - const DataT *const yBase; - OutT *dOutput; - char *smem; + const DataT* xn; + const DataT* yn; + const DataT* const yBase; + OutT* dOutput; + char* smem; CoreLambda core_op; EpilogueLambda epilog_op; FinalLambda fin_op; @@ -84,11 +90,21 @@ struct PairwiseDistances : public BaseClass { public: // Constructor - DI PairwiseDistances(const DataT *_x, const DataT *_y, IdxT _m, IdxT _n, - IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, - const DataT *_xn, const DataT *_yn, OutT *_dOutput, - char *_smem, CoreLambda _core_op, - EpilogueLambda _epilog_op, FinalLambda _fin_op, + DI PairwiseDistances(const DataT* _x, + const DataT* _y, + IdxT _m, + IdxT _n, + IdxT _k, + IdxT _lda, + IdxT _ldb, + IdxT _ldd, + const DataT* _xn, + const DataT* _yn, + OutT* _dOutput, + char* _smem, + CoreLambda _core_op, + EpilogueLambda _epilog_op, + FinalLambda _fin_op, rowEpilogueLambda _rowEpilog_op) : BaseClass(_x, _y, _m, _n, _k, _lda, _ldb, _ldd, _smem), xn(_xn), @@ -99,9 +115,12 @@ struct PairwiseDistances : public BaseClass { core_op(_core_op), epilog_op(_epilog_op), fin_op(_fin_op), - rowEpilog_op(_rowEpilog_op) {} + rowEpilog_op(_rowEpilog_op) + { + } - DI void run() { + DI void run() + { for (auto gridStrideY = blockIdx.y * P::Mblk; gridStrideY < this->m; gridStrideY += P::Mblk * gridDim.y) { for (auto gridStrideX = blockIdx.x * P::Nblk; gridStrideX < this->n; @@ -115,7 +134,8 @@ struct PairwiseDistances : public BaseClass { } private: - DI void updateIndicesY() { + DI void updateIndicesY() + { const auto stride = P::Nblk * gridDim.x; if (isRowMajor) { this->y += stride * this->ldb; @@ -125,21 +145,23 @@ struct PairwiseDistances : public BaseClass { this->yrowid += stride; } - DI void updateIndicesXY() { + DI void updateIndicesXY() + { const auto stride = P::Mblk * gridDim.y; if (isRowMajor) { this->x += stride * this->lda; this->yrowid = IdxT(blockIdx.x) * P::Nblk + this->srowid; - this->y = yBase + this->yrowid * this->ldb; + this->y = yBase + this->yrowid * this->ldb; } else { this->x += stride; this->yrowid = IdxT(blockIdx.x) * P::Nblk; - this->y = yBase + this->yrowid + this->srowid * this->ldb; + this->y = yBase + this->yrowid + this->srowid * this->ldb; } this->xrowid += stride; } - DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY) { + DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY) + { // Fetch next grid stride ldg if within range if ((gridStrideX + gridDim.x * P::Nblk) < this->n) { updateIndicesY(); @@ -150,10 +172,9 @@ struct PairwiseDistances : public BaseClass { } } - DI void prolog(IdxT gridStrideX, IdxT gridStrideY) { - if (gridStrideX == blockIdx.x * P::Nblk) { - this->ldgXY(0); - } + DI void prolog(IdxT gridStrideX, IdxT gridStrideY) + { + if (gridStrideX == blockIdx.x * P::Nblk) { this->ldgXY(0); } #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { @@ -168,7 +189,8 @@ struct PairwiseDistances : public BaseClass { this->pageWr ^= 1; } - DI void loop() { + DI void loop() + { for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { this->ldgXY(kidx); accumulate(); // on the previous k-block @@ -185,7 +207,8 @@ struct PairwiseDistances : public BaseClass { this->pageRd ^= 1; } - DI void accumulate() { + DI void accumulate() + { #pragma unroll for (int ki = 0; ki < P::Kblk; ki += P::Veclen) { this->ldsXY(ki); @@ -202,21 +225,22 @@ struct PairwiseDistances : public BaseClass { } } - DI void epilog(IdxT gridStrideX, IdxT gridStrideY) { + DI void epilog(IdxT gridStrideX, IdxT gridStrideY) + { if (useNorms) { - DataT *sxNorm = (DataT *)(&smem[P::SmemSize]); - DataT *syNorm = (&sxNorm[P::Mblk]); + DataT* sxNorm = (DataT*)(&smem[P::SmemSize]); + DataT* syNorm = (&sxNorm[P::Mblk]); // Load x & y norms required by this threadblock in shmem buffer if (gridStrideX == blockIdx.x * P::Nblk) { for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) { - auto idx = gridStrideY + i; + auto idx = gridStrideY + i; sxNorm[i] = idx < this->m ? xn[idx] : 0; } } for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) { - auto idx = gridStrideX + i; + auto idx = gridStrideX + i; syNorm[i] = idx < this->n ? yn[idx] : 0; } @@ -291,41 +315,68 @@ struct PairwiseDistances : public BaseClass { * @param fin_op the final gemm epilogue lambda */ -template +template __global__ __launch_bounds__(Policy::Nthreads, 2) - void pairwiseDistanceMatKernel(const DataT *x, const DataT *y, - const DataT *_xn, const DataT *_yn, IdxT m, - IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - OutT *dOutput, CoreLambda core_op, - EpilogueLambda epilog_op, FinalLambda fin_op) { + void pairwiseDistanceMatKernel(const DataT* x, + const DataT* y, + const DataT* _xn, + const DataT* _yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + CoreLambda core_op, + EpilogueLambda epilog_op, + FinalLambda fin_op) +{ extern __shared__ char smem[]; auto rowEpilog = [] __device__(IdxT starty) { return; }; - PairwiseDistances - obj(x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, - epilog_op, fin_op, rowEpilog); + PairwiseDistances + obj( + x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op, rowEpilog); obj.run(); } template -dim3 launchConfigGenerator(IdxT m, IdxT n, std::size_t sMemSize, T func) { - const auto numSMs = raft::getMultiProcessorCount(); +dim3 launchConfigGenerator(IdxT m, IdxT n, std::size_t sMemSize, T func) +{ + const auto numSMs = raft::getMultiProcessorCount(); int numBlocksPerSm = 0; dim3 grid; - CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocksPerSm, func, P::Nthreads, sMemSize)); + CUDA_CHECK( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, func, P::Nthreads, sMemSize)); std::size_t minGridSize = numSMs * numBlocksPerSm; - std::size_t yChunks = raft::ceildiv(m, P::Mblk); - std::size_t xChunks = raft::ceildiv(n, P::Nblk); - grid.y = yChunks > minGridSize ? minGridSize : yChunks; - grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks; + std::size_t yChunks = raft::ceildiv(m, P::Mblk); + std::size_t xChunks = raft::ceildiv(n, P::Nblk); + grid.y = yChunks > minGridSize ? minGridSize : yChunks; + grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks; if (grid.x != 1) { std::size_t i = 1; while (grid.y * i < minGridSize) { diff --git a/cpp/include/raft/distance/detail/russell_rao.cuh b/cpp/include/raft/distance/detail/russell_rao.cuh index 8e4c4824c3..d4fbb039e7 100644 --- a/cpp/include/raft/distance/detail/russell_rao.cuh +++ b/cpp/include/raft/distance/detail/russell_rao.cuh @@ -23,7 +23,7 @@ namespace detail { /** * @brief the Russell Rao distance matrix: - * It computes the following equation: + * It computes the following equation: Cij = (k - sum(x_i * y_i)) / k * * @tparam DataT input data-type (for A and B matrices) @@ -47,29 +47,42 @@ namespace detail { * @param[in] fin_op the final gemm epilogue lambda * @param[in] stream cuda stream to launch work */ -template -static void russellRaoImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, - IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +static void russellRaoImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef - typename std::conditional::type KPolicy; + typedef typename std::conditional::type KPolicy; dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - acc += x * y; - }; + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; const float one_over_k = 1.0 / k; // epilogue operation lambda for final value calculation auto epilog_lambda = [k, one_over_k] __device__( AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, IdxT gridStrideY) { #pragma unroll for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { @@ -81,46 +94,65 @@ static void russellRaoImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, }; if (isRowMajor) { - constexpr auto russellRaoRowMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - russellRaoRowMajor); + constexpr auto russellRaoRowMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, russellRaoRowMajor); russellRaoRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } else { - constexpr auto russellRaoColMajor = - pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, - russellRaoColMajor); + constexpr auto russellRaoColMajor = pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, russellRaoColMajor); russellRaoColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, - epilog_lambda, fin_op); + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); } CUDA_CHECK(cudaGetLastError()); } -template -void russellRao(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, OutT *dOutput, - FinalLambda fin_op, cudaStream_t stream) { +template +void russellRao(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + OutT* dOutput, + FinalLambda fin_op, + cudaStream_t stream) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - russellRaoImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + russellRaoImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - russellRaoImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, - stream); + russellRaoImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); } else { russellRaoImpl( x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); @@ -129,7 +161,7 @@ void russellRao(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, /** * @brief the Russell Rao distance matrix calculation - * It computes the following equation: + * It computes the following equation: Cij = (k - sum(x_i * y_i)) / k * * @tparam InType input data-type (for A and B matrices) @@ -147,16 +179,25 @@ void russellRao(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void russellRaoImpl(int m, int n, int k, const InType *pA, const InType *pB, - OutType *pD, FinalLambda fin_op, cudaStream_t stream, - bool isRowMajor) { +template +void russellRaoImpl(int m, + int n, + int k, + const InType* pA, + const InType* pB, + OutType* pD, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor) +{ typedef std::is_same is_bool; - typedef typename std::conditional::type - russellRaoOutType; + typedef typename std::conditional::type russellRaoOutType; Index_ lda, ldb, ldd; - russellRaoOutType *pDcast = reinterpret_cast(pD); + russellRaoOutType* pDcast = reinterpret_cast(pD); if (isRowMajor) { lda = k, ldb = k, ldd = n; russellRao( diff --git a/cpp/include/raft/distance/distance.hpp b/cpp/include/raft/distance/distance.hpp index 8b55543ff8..66832c12d2 100644 --- a/cpp/include/raft/distance/distance.hpp +++ b/cpp/include/raft/distance/distance.hpp @@ -25,132 +25,163 @@ namespace raft { namespace distance { /** -* @brief Evaluate pairwise distances with the user epilogue lamba allowed -* @tparam DistanceType which distance to evaluate -* @tparam InType input argument type -* @tparam AccType accumulation type -* @tparam OutType output type -* @tparam FinalLambda user-defined epilogue lamba -* @tparam Index_ Index type -* @param x first set of points -* @param y second set of points -* @param dist output distance matrix -* @param m number of points in x -* @param n number of points in y -* @param k dimensionality -* @param workspace temporary workspace needed for computations -* @param worksize number of bytes of the workspace -* @param fin_op the final gemm epilogue lambda -* @param stream cuda stream -* @param isRowMajor whether the matrices are row-major or col-major -* @param metric_arg metric argument (used for Minkowski distance) -* -* @note fin_op: This is a device lambda which is supposed to operate upon the -* input which is AccType and returns the output in OutType. It's signature is -* as follows:

OutType fin_op(AccType in, int g_idx);
. If one needs -* any other parameters, feel free to pass them via closure. -*/ -template OutType fin_op(AccType in, int g_idx);. If one needs + * any other parameters, feel free to pass them via closure. + */ +template -void distance(const InType *x, const InType *y, OutType *dist, Index_ m, - Index_ n, Index_ k, void *workspace, size_t worksize, - FinalLambda fin_op, cudaStream_t stream, bool isRowMajor = true, - InType metric_arg = 2.0f) { +void distance(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor = true, + InType metric_arg = 2.0f) +{ detail::distance( - x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, - metric_arg); + x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg); } /** -* @brief Evaluate pairwise distances for the simple use case -* @tparam DistanceType which distance to evaluate -* @tparam InType input argument type -* @tparam AccType accumulation type -* @tparam OutType output type -* @tparam Index_ Index type -* @param x first set of points -* @param y second set of points -* @param dist output distance matrix -* @param m number of points in x -* @param n number of points in y -* @param k dimensionality -* @param workspace temporary workspace needed for computations -* @param worksize number of bytes of the workspace -* @param stream cuda stream -* @param isRowMajor whether the matrices are row-major or col-major -* @param metric_arg metric argument (used for Minkowski distance) -* -* @note if workspace is passed as nullptr, this will return in -* worksize, the number of bytes of workspace required -*/ -template -void distance(const InType *x, const InType *y, OutType *dist, Index_ m, - Index_ n, Index_ k, void *workspace, size_t worksize, - cudaStream_t stream, bool isRowMajor = true, - InType metric_arg = 2.0f) { + * @brief Evaluate pairwise distances for the simple use case + * @tparam DistanceType which distance to evaluate + * @tparam InType input argument type + * @tparam AccType accumulation type + * @tparam OutType output type + * @tparam Index_ Index type + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param workspace temporary workspace needed for computations + * @param worksize number of bytes of the workspace + * @param stream cuda stream + * @param isRowMajor whether the matrices are row-major or col-major + * @param metric_arg metric argument (used for Minkowski distance) + * + * @note if workspace is passed as nullptr, this will return in + * worksize, the number of bytes of workspace required + */ +template +void distance(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + cudaStream_t stream, + bool isRowMajor = true, + InType metric_arg = 2.0f) +{ detail::distance( x, y, dist, m, n, k, workspace, worksize, stream, isRowMajor, metric_arg); } /** -* @brief Return the exact workspace size to compute the distance -* @tparam DistanceType which distance to evaluate -* @tparam InType input argument type -* @tparam AccType accumulation type -* @tparam OutType output type -* @tparam Index_ Index type -* @param x first set of points -* @param y second set of points -* @param m number of points in x -* @param n number of points in y -* @param k dimensionality -* -* @note If the specified distanceType doesn't need the workspace at all, it -* returns 0. -*/ -template -size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n, - Index_ k) { - return detail::getWorkspaceSize(x, y, m, n, k); + * @brief Return the exact workspace size to compute the distance + * @tparam DistanceType which distance to evaluate + * @tparam InType input argument type + * @tparam AccType accumulation type + * @tparam OutType output type + * @tparam Index_ Index type + * @param x first set of points + * @param y second set of points + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * + * @note If the specified distanceType doesn't need the workspace at all, it + * returns 0. + */ +template +size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k) +{ + return detail::getWorkspaceSize(x, y, m, n, k); } /** -* @brief Evaluate pairwise distances for the simple use case -* @tparam DistanceType which distance to evaluate -* @tparam InType input argument type -* @tparam AccType accumulation type -* @tparam OutType output type -* @tparam Index_ Index type -* @param x first set of points -* @param y second set of points -* @param dist output distance matrix -* @param m number of points in x -* @param n number of points in y -* @param k dimensionality -* @param stream cuda stream -* @param isRowMajor whether the matrices are row-major or col-major -* @param metric_arg metric argument (used for Minkowski distance) -* -* @note if workspace is passed as nullptr, this will return in -* worksize, the number of bytes of workspace required -*/ -template -void distance(const InType *x, const InType *y, OutType *dist, Index_ m, - Index_ n, Index_ k, cudaStream_t stream, bool isRowMajor = true, - InType metric_arg = 2.0f) { + * @brief Evaluate pairwise distances for the simple use case + * @tparam DistanceType which distance to evaluate + * @tparam InType input argument type + * @tparam AccType accumulation type + * @tparam OutType output type + * @tparam Index_ Index type + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param stream cuda stream + * @param isRowMajor whether the matrices are row-major or col-major + * @param metric_arg metric argument (used for Minkowski distance) + * + * @note if workspace is passed as nullptr, this will return in + * worksize, the number of bytes of workspace required + */ +template +void distance(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + cudaStream_t stream, + bool isRowMajor = true, + InType metric_arg = 2.0f) +{ rmm::device_uvector workspace(0, stream); - auto worksize = - getWorkspaceSize(x, y, m, n, - k); + auto worksize = getWorkspaceSize(x, y, m, n, k); workspace.resize(worksize, stream); detail::distance( - x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor, - metric_arg); + x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor, metric_arg); } /** @@ -173,119 +204,117 @@ void distance(const InType *x, const InType *y, OutType *dist, Index_ m, * @param isRowMajor whether the matrices are row-major or col-major */ template -void pairwise_distance(const raft::handle_t &handle, const Type *x, - const Type *y, Type *dist, Index_ m, Index_ n, Index_ k, - rmm::device_uvector &workspace, +void pairwise_distance(const raft::handle_t& handle, + const Type* x, + const Type* y, + Type* dist, + Index_ m, + Index_ n, + Index_ k, + rmm::device_uvector& workspace, raft::distance::DistanceType metric, - bool isRowMajor = true, Type metric_arg = 2.0f) { + bool isRowMajor = true, + Type metric_arg = 2.0f) +{ switch (metric) { case raft::distance::DistanceType::L2Expanded: - detail::pairwise_distance_impl( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::L2SqrtExpanded: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::L2SqrtExpanded>( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::CosineExpanded: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::CosineExpanded>( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::L1: - detail::pairwise_distance_impl( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::L2Unexpanded: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::L2Unexpanded>( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::L2SqrtUnexpanded: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::L2SqrtUnexpanded>( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::Linf: - detail::pairwise_distance_impl( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::HellingerExpanded: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::HellingerExpanded>( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::LpUnexpanded: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::LpUnexpanded>( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor, - metric_arg); + detail::pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor, metric_arg); break; case raft::distance::DistanceType::Canberra: - detail::pairwise_distance_impl( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::HammingUnexpanded: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::HammingUnexpanded>( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::JensenShannon: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::JensenShannon>( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::RusselRaoExpanded: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::RusselRaoExpanded>( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::KLDivergence: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::KLDivergence>( + detail::pairwise_distance_impl( x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; case raft::distance::DistanceType::CorrelationExpanded: - detail::pairwise_distance_impl< - Type, Index_, raft::distance::DistanceType::CorrelationExpanded>( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + detail:: + pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); break; - default: - THROW("Unknown or unsupported distance metric '%d'!", (int)metric); + default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); }; } /** @} */ /** - * @defgroup pairwise_distance pairwise distance prims - * @{ - * @brief Convenience wrapper around 'distance' prim to convert runtime metric - * into compile time for the purpose of dispatch - * @tparam Type input/accumulation/output data-type - * @tparam Index_ indexing type - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param metric distance metric - * @param stream cuda stream - * @param isRowMajor whether the matrices are row-major or col-major - */ + * @defgroup pairwise_distance pairwise distance prims + * @{ + * @brief Convenience wrapper around 'distance' prim to convert runtime metric + * into compile time for the purpose of dispatch + * @tparam Type input/accumulation/output data-type + * @tparam Index_ indexing type + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param metric distance metric + * @param stream cuda stream + * @param isRowMajor whether the matrices are row-major or col-major + */ template -void pairwise_distance(const raft::handle_t &handle, const Type *x, - const Type *y, Type *dist, Index_ m, Index_ n, Index_ k, +void pairwise_distance(const raft::handle_t& handle, + const Type* x, + const Type* y, + Type* dist, + Index_ m, + Index_ n, + Index_ k, raft::distance::DistanceType metric, - bool isRowMajor = true, Type metric_arg = 2.0f) { + bool isRowMajor = true, + Type metric_arg = 2.0f) +{ rmm::device_uvector workspace(0, handle.get_stream()); - pairwise_distance(handle, x, y, dist, m, n, k, workspace, - metric, isRowMajor, metric_arg); + pairwise_distance( + handle, x, y, dist, m, n, k, workspace, metric, isRowMajor, metric_arg); } }; // namespace distance diff --git a/cpp/include/raft/distance/fused_l2_nn.hpp b/cpp/include/raft/distance/fused_l2_nn.hpp index 0a730506c8..d924ef217c 100644 --- a/cpp/include/raft/distance/fused_l2_nn.hpp +++ b/cpp/include/raft/distance/fused_l2_nn.hpp @@ -30,8 +30,7 @@ template using KVPMinReduce = detail::KVPMinReduceImpl; template -using MinAndDistanceReduceOp = - detail::MinAndDistanceReduceOpImpl; +using MinAndDistanceReduceOp = detail::MinAndDistanceReduceOpImpl; template using MinReduceOp = detail::MinReduceOpImpl; @@ -40,10 +39,9 @@ using MinReduceOp = detail::MinReduceOpImpl; * Initialize array using init value from reduction op */ template -void initialize(const raft::handle_t& handle, OutT* min, IdxT m, DataT maxVal, - ReduceOpT redOp) { - detail::initialize(min, m, maxVal, redOp, - handle.get_stream()); +void initialize(const raft::handle_t& handle, OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) +{ + detail::initialize(min, m, maxVal, redOp, handle.get_stream()); } /** @@ -82,25 +80,32 @@ void initialize(const raft::handle_t& handle, OutT* min, IdxT m, DataT maxVal, * main kernel launch * @param[in] stream cuda stream */ -template -void fusedL2NN(OutT* min, const DataT* x, const DataT* y, const DataT* xn, - const DataT* yn, IdxT m, IdxT n, IdxT k, void* workspace, - ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt, - bool initOutBuffer, cudaStream_t stream) { +template +void fusedL2NN(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + void* workspace, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream) +{ size_t bytes = sizeof(DataT) * k; if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) { detail::fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, - initOutBuffer, stream); + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) { detail::fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, - initOutBuffer, stream); + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); } else { detail::fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, - initOutBuffer, stream); + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); } } diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp index c62f2e5f79..773b83ab13 100644 --- a/cpp/include/raft/error.hpp +++ b/cpp/include/raft/error.hpp @@ -31,14 +31,14 @@ class exception : public std::exception { explicit exception() noexcept : std::exception(), msg_() {} /** copy ctor */ - exception(exception const& src) noexcept - : std::exception(), msg_(src.what()) { + exception(exception const& src) noexcept : std::exception(), msg_(src.what()) + { collect_call_stack(); } /** ctor from an input message */ - explicit exception(std::string const msg) noexcept - : std::exception(), msg_(std::move(msg)) { + explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg)) + { collect_call_stack(); } @@ -51,7 +51,8 @@ class exception : public std::exception { /** append call stack info to this exception's message for ease of debug */ // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html - void collect_call_stack() noexcept { + void collect_call_stack() noexcept + { #ifdef __GNUC__ constexpr int kMaxStackDepth = 64; void* stack[kMaxStackDepth]; // NOLINT @@ -90,16 +91,16 @@ struct logic_error : public raft::exception { // FIXME: Need to be replaced with RAFT_FAIL /** macro to throw a runtime error */ -#define THROW(fmt, ...) \ - do { \ - std::string msg; \ - char errMsg[2048]; /* NOLINT */ \ - std::snprintf(errMsg, sizeof(errMsg), \ - "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ - msg += errMsg; \ - std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \ - msg += errMsg; \ - throw raft::exception(msg); \ +#define THROW(fmt, ...) \ + do { \ + std::string msg; \ + char errMsg[2048]; /* NOLINT */ \ + std::snprintf( \ + errMsg, sizeof(errMsg), "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ + msg += errMsg; \ + std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \ + msg += errMsg; \ + throw raft::exception(msg); \ } while (0) // FIXME: Need to be replaced with RAFT_EXPECTS @@ -109,16 +110,15 @@ struct logic_error : public raft::exception { if (!(check)) THROW(fmt, ##__VA_ARGS__); \ } while (0) -#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) \ - do { \ - char err_msg[2048]; /* NOLINT */ \ - std::snprintf(err_msg, sizeof(err_msg), location_prefix); \ - msg += err_msg; \ - std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, \ - __LINE__); \ - msg += err_msg; \ - std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__); \ - msg += err_msg; \ +#define SET_ERROR_MSG(msg, location_prefix, fmt, ...) \ + do { \ + char err_msg[2048]; /* NOLINT */ \ + std::snprintf(err_msg, sizeof(err_msg), location_prefix); \ + msg += err_msg; \ + std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, __LINE__); \ + msg += err_msg; \ + std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__); \ + msg += err_msg; \ } while (0) /** diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index 794951ca9c..70fff1e210 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -61,34 +61,30 @@ class handle_t { int cur_dev = -1; CUDA_CHECK(cudaGetDevice(&cur_dev)); return cur_dev; - }()) { - if (n_streams != 0) { - streams_ = std::make_unique(n_streams); - } + }()) + { + if (n_streams != 0) { streams_ = std::make_unique(n_streams); } create_resources(); thrust_policy_ = std::make_unique(user_stream_); } /** - * @brief Construct a light handle copy from another + * @brief Construct a light handle copy from another * user stream, cuda handles, comms and worker pool are not copied - * The user_stream of the returned handle is set to the specified stream + * The user_stream of the returned handle is set to the specified stream * of the other handle worker pool * @param[in] other other handle for which to use streams - * @param[in] stream_id stream id in `other` worker streams + * @param[in] stream_id stream id in `other` worker streams * to be set as user stream in the constructed handle * @param[in] n_streams number worker streams to be created */ - handle_t(const handle_t& other, int stream_id, - int n_streams = kNumDefaultWorkerStreams) - : dev_id_(other.get_device()) { - RAFT_EXPECTS( - other.get_num_internal_streams() > 0, - "ERROR: the main handle must have at least one worker stream\n"); - if (n_streams != 0) { - streams_ = std::make_unique(n_streams); - } - prop_ = other.get_device_properties(); + handle_t(const handle_t& other, int stream_id, int n_streams = kNumDefaultWorkerStreams) + : dev_id_(other.get_device()) + { + RAFT_EXPECTS(other.get_num_internal_streams() > 0, + "ERROR: the main handle must have at least one worker stream\n"); + if (n_streams != 0) { streams_ = std::make_unique(n_streams); } + prop_ = other.get_device_properties(); device_prop_initialized_ = true; create_resources(); set_stream(other.get_internal_stream(stream_id)); @@ -102,11 +98,10 @@ class handle_t { void set_stream(cudaStream_t stream) { user_stream_ = stream; } cudaStream_t get_stream() const { return user_stream_; } - rmm::cuda_stream_view get_stream_view() const { - return rmm::cuda_stream_view(user_stream_); - } + rmm::cuda_stream_view get_stream_view() const { return rmm::cuda_stream_view(user_stream_); } - cublasHandle_t get_cublas_handle() const { + cublasHandle_t get_cublas_handle() const + { std::lock_guard _(mutex_); if (!cublas_initialized_) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); @@ -115,7 +110,8 @@ class handle_t { return cublas_handle_; } - cusolverDnHandle_t get_cusolver_dn_handle() const { + cusolverDnHandle_t get_cusolver_dn_handle() const + { std::lock_guard _(mutex_); if (!cusolver_dn_initialized_) { CUSOLVER_CHECK(cusolverDnCreate(&cusolver_dn_handle_)); @@ -124,7 +120,8 @@ class handle_t { return cusolver_dn_handle_; } - cusolverSpHandle_t get_cusolver_sp_handle() const { + cusolverSpHandle_t get_cusolver_sp_handle() const + { std::lock_guard _(mutex_); if (!cusolver_sp_initialized_) { CUSOLVER_CHECK(cusolverSpCreate(&cusolver_sp_handle_)); @@ -133,7 +130,8 @@ class handle_t { return cusolver_sp_handle_; } - cusparseHandle_t get_cusparse_handle() const { + cusparseHandle_t get_cusparse_handle() const + { std::lock_guard _(mutex_); if (!cusparse_initialized_) { CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); @@ -145,25 +143,27 @@ class handle_t { rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } // legacy compatibility for cuML - cudaStream_t get_internal_stream(int sid) const { - RAFT_EXPECTS( - streams_.get() != nullptr, - "ERROR: rmm::cuda_stream_pool was not initialized with a non-zero value"); + cudaStream_t get_internal_stream(int sid) const + { + RAFT_EXPECTS(streams_.get() != nullptr, + "ERROR: rmm::cuda_stream_pool was not initialized with a non-zero value"); return streams_->get_stream(sid).value(); } // new accessor return rmm::cuda_stream_view - rmm::cuda_stream_view get_internal_stream_view(int sid) const { - RAFT_EXPECTS( - streams_.get() != nullptr, - "ERROR: rmm::cuda_stream_pool was not initialized with a non-zero value"); + rmm::cuda_stream_view get_internal_stream_view(int sid) const + { + RAFT_EXPECTS(streams_.get() != nullptr, + "ERROR: rmm::cuda_stream_pool was not initialized with a non-zero value"); return streams_->get_stream(sid); } - int get_num_internal_streams() const { + int get_num_internal_streams() const + { return streams_.get() != nullptr ? streams_->get_pool_size() : 0; } - std::vector get_internal_streams() const { + std::vector get_internal_streams() const + { std::vector int_streams_vec; for (int i = 0; i < get_num_internal_streams(); i++) { int_streams_vec.push_back(get_internal_stream(i)); @@ -171,49 +171,51 @@ class handle_t { return int_streams_vec; } - void wait_on_user_stream() const { + void wait_on_user_stream() const + { CUDA_CHECK(cudaEventRecord(event_, user_stream_)); for (int i = 0; i < get_num_internal_streams(); i++) { CUDA_CHECK(cudaStreamWaitEvent(get_internal_stream(i), event_, 0)); } } - void wait_on_internal_streams() const { + void wait_on_internal_streams() const + { for (int i = 0; i < get_num_internal_streams(); i++) { CUDA_CHECK(cudaEventRecord(event_, get_internal_stream(i))); CUDA_CHECK(cudaStreamWaitEvent(user_stream_, event_, 0)); } } - void set_comms(std::shared_ptr communicator) { - communicator_ = communicator; - } + void set_comms(std::shared_ptr communicator) { communicator_ = communicator; } - const comms::comms_t& get_comms() const { - RAFT_EXPECTS(this->comms_initialized(), - "ERROR: Communicator was not initialized\n"); + const comms::comms_t& get_comms() const + { + RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n"); return *communicator_; } - void set_subcomm(std::string key, std::shared_ptr subcomm) { + void set_subcomm(std::string key, std::shared_ptr subcomm) + { subcomms_[key] = subcomm; } - const comms::comms_t& get_subcomm(std::string key) const { - RAFT_EXPECTS(subcomms_.find(key) != subcomms_.end(), - "%s was not found in subcommunicators.", key.c_str()); + const comms::comms_t& get_subcomm(std::string key) const + { + RAFT_EXPECTS( + subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str()); auto subcomm = subcomms_.at(key); - RAFT_EXPECTS(nullptr != subcomm.get(), - "ERROR: Subcommunicator was not initialized"); + RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized"); return *subcomm; } bool comms_initialized() const { return (nullptr != communicator_.get()); } - const cudaDeviceProp& get_device_properties() const { + const cudaDeviceProp& get_device_properties() const + { std::lock_guard _(mutex_); if (!device_prop_initialized_) { CUDA_CHECK(cudaGetDeviceProperties(&prop_, dev_id_)); @@ -243,29 +245,28 @@ class handle_t { mutable bool device_prop_initialized_{false}; mutable std::mutex mutex_; - void create_resources() { - CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } + void create_resources() { CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); } - void destroy_resources() { + void destroy_resources() + { ///@todo: enable *_NO_THROW variants once we have enabled logging if (cusparse_initialized_) { - //CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_)); + // CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_)); CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_)); } if (cusolver_dn_initialized_) { - //CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); + // CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); CUSOLVER_CHECK(cusolverDnDestroy(cusolver_dn_handle_)); } if (cusolver_sp_initialized_) { - //CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); + // CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); CUSOLVER_CHECK(cusolverSpDestroy(cusolver_sp_handle_)); } if (cublas_initialized_) { - //CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_)); + // CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_)); CUBLAS_CHECK(cublasDestroy(cublas_handle_)); } - //CUDA_CHECK_NO_THROW(cudaEventDestroy(event_)); + // CUDA_CHECK_NO_THROW(cudaEventDestroy(event_)); CUDA_CHECK(cudaEventDestroy(event_)); } }; // class handle_t @@ -275,7 +276,8 @@ class handle_t { */ class stream_syncer { public: - explicit stream_syncer(const handle_t& handle) : handle_(handle) { + explicit stream_syncer(const handle_t& handle) : handle_(handle) + { handle_.wait_on_user_stream(); } ~stream_syncer() { handle_.wait_on_internal_streams(); } diff --git a/cpp/include/raft/integer_utils.h b/cpp/include/raft/integer_utils.h index a7cfb9287b..5fc56de14b 100644 --- a/cpp/include/raft/integer_utils.h +++ b/cpp/include/raft/integer_utils.h @@ -34,15 +34,13 @@ namespace raft { * `modulus` is positive. */ template -inline S round_up_safe(S number_to_round, S modulus) { +inline S round_up_safe(S number_to_round, S modulus) +{ auto remainder = number_to_round % modulus; - if (remainder == 0) { - return number_to_round; - } + if (remainder == 0) { return number_to_round; } auto rounded_up = number_to_round - remainder + modulus; if (rounded_up < number_to_round) { - throw std::invalid_argument( - "Attempt to round up beyond the type's maximum value"); + throw std::invalid_argument("Attempt to round up beyond the type's maximum value"); } return rounded_up; } @@ -53,8 +51,9 @@ inline S round_up_safe(S number_to_round, S modulus) { * `modulus` is positive. */ template -inline S round_down_safe(S number_to_round, S modulus) { - auto remainder = number_to_round % modulus; +inline S round_down_safe(S number_to_round, S modulus) +{ + auto remainder = number_to_round % modulus; auto rounded_down = number_to_round - remainder; return rounded_down; } @@ -72,25 +71,28 @@ inline S round_down_safe(S number_to_round, S modulus) { * the result will be incorrect */ template -constexpr inline S div_rounding_up_unsafe(const S& dividend, - const T& divisor) noexcept { +constexpr inline S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept +{ return (dividend + divisor - 1) / divisor; } namespace detail { template constexpr inline I div_rounding_up_safe(std::integral_constant, - I dividend, I divisor) noexcept { + I dividend, + I divisor) noexcept +{ // TODO: This could probably be implemented faster - return (dividend > divisor) - ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor) - : (dividend > 0); + return (dividend > divisor) ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor) + : (dividend > 0); } template constexpr inline I div_rounding_up_safe(std::integral_constant, - I dividend, I divisor) noexcept { - auto quotient = dividend / divisor; + I dividend, + I divisor) noexcept +{ + auto quotient = dividend / divisor; auto remainder = dividend % divisor; return quotient + (remainder != 0); } @@ -110,16 +112,17 @@ constexpr inline I div_rounding_up_safe(std::integral_constant, * approach of using (dividend + divisor - 1) / divisor */ template -constexpr inline std::enable_if_t::value, I> -div_rounding_up_safe(I dividend, I divisor) noexcept { - using i_is_a_signed_type = - std::integral_constant::value>; +constexpr inline std::enable_if_t::value, I> div_rounding_up_safe( + I dividend, I divisor) noexcept +{ + using i_is_a_signed_type = std::integral_constant::value>; return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor); } template -constexpr inline std::enable_if_t::value, bool> -is_a_power_of_two(I val) noexcept { +constexpr inline std::enable_if_t::value, bool> is_a_power_of_two( + I val) noexcept +{ return ((val - 1) & val) == 0; } @@ -147,14 +150,14 @@ is_a_power_of_two(I val) noexcept { * @return Absolute value if value type is signed. */ template -std::enable_if_t::value, T> constexpr inline absolute_value( - T value) { +std::enable_if_t::value, T> constexpr inline absolute_value(T value) +{ return std::abs(value); } // Unsigned type just returns itself. template -std::enable_if_t::value, T> constexpr inline absolute_value( - T value) { +std::enable_if_t::value, T> constexpr inline absolute_value(T value) +{ return value; } diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh index b2302836bc..a2e29952d7 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/classlabels.cuh @@ -42,26 +42,25 @@ namespace label { * \param [in] stream cuda stream */ template -int getUniquelabels(rmm::device_uvector &unique, value_t *y, size_t n, - cudaStream_t stream) { +int getUniquelabels(rmm::device_uvector& unique, value_t* y, size_t n, cudaStream_t stream) +{ rmm::device_scalar d_num_selected(stream); rmm::device_uvector workspace(n, stream); - size_t bytes = 0; + size_t bytes = 0; size_t bytes2 = 0; // Query how much temporary storage we will need for cub operations // and allocate it cub::DeviceRadixSort::SortKeys(NULL, bytes, y, workspace.data(), n); - cub::DeviceSelect::Unique(NULL, bytes2, workspace.data(), workspace.data(), - d_num_selected.data(), n); + cub::DeviceSelect::Unique( + NULL, bytes2, workspace.data(), workspace.data(), d_num_selected.data(), n); bytes = max(bytes, bytes2); rmm::device_uvector cub_storage(bytes, stream); // Select Unique classes - cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, workspace.data(), - n); - cub::DeviceSelect::Unique(cub_storage.data(), bytes, workspace.data(), - workspace.data(), d_num_selected.data(), n); + cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, workspace.data(), n); + cub::DeviceSelect::Unique( + cub_storage.data(), bytes, workspace.data(), workspace.data(), d_num_selected.data(), n); int n_unique = d_num_selected.value(stream); // Copy unique classes to output @@ -90,16 +89,17 @@ int getUniquelabels(rmm::device_uvector &unique, value_t *y, size_t n, * \param [in] stream cuda stream */ template -void getOvrlabels(value_t *y, int n, value_t *y_unique, int n_classes, - value_t *y_out, int idx, cudaStream_t stream) { +void getOvrlabels( + value_t* y, int n, value_t* y_unique, int n_classes, value_t* y_out, int idx, cudaStream_t stream) +{ ASSERT(idx < n_classes, "Parameter idx should not be larger than the number " "of classes"); raft::linalg::unaryOp( - y_out, y, n, - [idx, y_unique] __device__(value_t y) { - return y == y_unique[idx] ? +1 : -1; - }, + y_out, + y, + n, + [idx, y_unique] __device__(value_t y) { return y == y_unique[idx] ? +1 : -1; }, stream); CUDA_CHECK(cudaPeekAtLastError()); } @@ -108,9 +108,14 @@ void getOvrlabels(value_t *y, int n, value_t *y_unique, int n_classes, // +/-1, return array with the new class labels and corresponding indices. template -__global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in, - Type *out, size_t N, Lambda filter_op, - bool zero_based = false) { +__global__ void map_label_kernel(Type* map_ids, + size_t N_labels, + Type* in, + Type* out, + size_t N, + Lambda filter_op, + bool zero_based = false) +{ int tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { if (!filter_op(in[tid])) { @@ -125,27 +130,28 @@ __global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in, } /** - * Maps an input array containing a series of numbers into a new array - * where numbers have been mapped to a monotonically increasing set - * of labels. This can be useful in machine learning algorithms, for instance, - * where a given set of labels is not taken from a monotonically increasing - * set. This can happen if they are filtered or if only a subset of the - * total labels are used in a dataset. This is also useful in graph algorithms - * where a set of vertices need to be labeled in a monotonically increasing - * order. - * @tparam Type the numeric type of the input and output arrays - * @tparam Lambda the type of an optional filter function, which determines - * which items in the array to map. - * @param out the output monotonic array - * @param in input label array - * @param N number of elements in the input array - * @param stream cuda stream to use - * @param filter_op an optional function for specifying which values - * should have monotonically increasing labels applied to them. - */ + * Maps an input array containing a series of numbers into a new array + * where numbers have been mapped to a monotonically increasing set + * of labels. This can be useful in machine learning algorithms, for instance, + * where a given set of labels is not taken from a monotonically increasing + * set. This can happen if they are filtered or if only a subset of the + * total labels are used in a dataset. This is also useful in graph algorithms + * where a set of vertices need to be labeled in a monotonically increasing + * order. + * @tparam Type the numeric type of the input and output arrays + * @tparam Lambda the type of an optional filter function, which determines + * which items in the array to map. + * @param out the output monotonic array + * @param in input label array + * @param N number of elements in the input array + * @param stream cuda stream to use + * @param filter_op an optional function for specifying which values + * should have monotonically increasing labels applied to them. + */ template -void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, - Lambda filter_op, bool zero_based = false) { +void make_monotonic( + Type* out, Type* in, size_t N, cudaStream_t stream, Lambda filter_op, bool zero_based = false) +{ static const size_t TPB_X = 256; dim3 blocks(raft::ceildiv(N, TPB_X)); @@ -159,25 +165,25 @@ void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, } /** - * Maps an input array containing a series of numbers into a new array - * where numbers have been mapped to a monotonically increasing set - * of labels. This can be useful in machine learning algorithms, for instance, - * where a given set of labels is not taken from a monotonically increasing - * set. This can happen if they are filtered or if only a subset of the - * total labels are used in a dataset. This is also useful in graph algorithms - * where a set of vertices need to be labeled in a monotonically increasing - * order. - * @tparam Type the numeric type of the input and output arrays - * @tparam Lambda the type of an optional filter function, which determines - * which items in the array to map. - * @param out output label array with labels assigned monotonically - * @param in input label array - * @param N number of elements in the input array - * @param stream cuda stream to use - */ + * Maps an input array containing a series of numbers into a new array + * where numbers have been mapped to a monotonically increasing set + * of labels. This can be useful in machine learning algorithms, for instance, + * where a given set of labels is not taken from a monotonically increasing + * set. This can happen if they are filtered or if only a subset of the + * total labels are used in a dataset. This is also useful in graph algorithms + * where a set of vertices need to be labeled in a monotonically increasing + * order. + * @tparam Type the numeric type of the input and output arrays + * @tparam Lambda the type of an optional filter function, which determines + * which items in the array to map. + * @param out output label array with labels assigned monotonically + * @param in input label array + * @param N number of elements in the input array + * @param stream cuda stream to use + */ template -void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, - bool zero_based = false) { +void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zero_based = false) +{ make_monotonic( out, in, N, stream, [] __device__(Type val) { return false; }, zero_based); } diff --git a/cpp/include/raft/label/merge_labels.cuh b/cpp/include/raft/label/merge_labels.cuh index bed74581a2..1ee0659b0d 100644 --- a/cpp/include/raft/label/merge_labels.cuh +++ b/cpp/include/raft/label/merge_labels.cuh @@ -35,8 +35,10 @@ __global__ void __launch_bounds__(TPB_X) propagate_label_kernel(const value_idx* __restrict__ labels_a, const value_idx* __restrict__ labels_b, value_idx* __restrict__ R, - const bool* __restrict__ mask, bool* __restrict__ m, - value_idx N) { + const bool* __restrict__ mask, + bool* __restrict__ m, + value_idx N) +{ value_idx tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { if (__ldg((char*)mask + tid)) { @@ -65,15 +67,17 @@ template __global__ void __launch_bounds__(TPB_X) reassign_label_kernel(value_idx* __restrict__ labels_a, const value_idx* __restrict__ labels_b, - const value_idx* __restrict__ R, value_idx N, - value_idx MAX_LABEL) { + const value_idx* __restrict__ R, + value_idx N, + value_idx MAX_LABEL) +{ value_idx tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { // Note: labels are from 1 to N - value_idx la = labels_a[tid]; - value_idx lb = __ldg(labels_b + tid); - value_idx ra = (la == MAX_LABEL) ? MAX_LABEL : __ldg(R + (la - 1)) + 1; - value_idx rb = (lb == MAX_LABEL) ? MAX_LABEL : __ldg(R + (lb - 1)) + 1; + value_idx la = labels_a[tid]; + value_idx lb = __ldg(labels_b + tid); + value_idx ra = (la == MAX_LABEL) ? MAX_LABEL : __ldg(R + (la - 1)) + 1; + value_idx rb = (lb == MAX_LABEL) ? MAX_LABEL : __ldg(R + (lb - 1)) + 1; labels_a[tid] = min(ra, rb); } } @@ -108,9 +112,14 @@ __global__ void __launch_bounds__(TPB_X) * @param[in] stream CUDA stream */ template -void merge_labels(value_idx* labels_a, const value_idx* labels_b, - const bool* mask, value_idx* R, bool* m, value_idx N, - cudaStream_t stream) { +void merge_labels(value_idx* labels_a, + const value_idx* labels_b, + const bool* mask, + value_idx* R, + bool* m, + value_idx N, + cudaStream_t stream) +{ dim3 blocks(raft::ceildiv(N, value_idx(TPB_X))); dim3 threads(TPB_X); value_idx MAX_LABEL = std::numeric_limits::max(); diff --git a/cpp/include/raft/lap/d_structs.h b/cpp/include/raft/lap/d_structs.h index ed545b7198..e488dc528f 100644 --- a/cpp/include/raft/lap/d_structs.h +++ b/cpp/include/raft/lap/d_structs.h @@ -26,18 +26,18 @@ template struct Vertices { - vertex_t *row_assignments; - vertex_t *col_assignments; - int *row_covers; - int *col_covers; - weight_t *row_duals; - weight_t *col_duals; - weight_t *col_slacks; + vertex_t* row_assignments; + vertex_t* col_assignments; + int* row_covers; + int* col_covers; + weight_t* row_duals; + weight_t* col_duals; + weight_t* col_slacks; }; template struct VertexData { - vertex_t *parents; - vertex_t *children; - int *is_visited; + vertex_t* parents; + vertex_t* children; + int* is_visited; }; diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh index f64afb3549..42b898ebff 100644 --- a/cpp/include/raft/lap/lap.cuh +++ b/cpp/include/raft/lap/lap.cuh @@ -39,12 +39,12 @@ class LinearAssignmentProblem { vertex_t batchsize_; weight_t epsilon_; - weight_t const *d_costs_; + weight_t const* d_costs_; Vertices d_vertices_dev; VertexData d_row_data_dev, d_col_data_dev; - raft::handle_t const &handle_; + raft::handle_t const& handle_; rmm::device_uvector row_covers_v; rmm::device_uvector col_covers_v; rmm::device_uvector row_duals_v; @@ -60,8 +60,10 @@ class LinearAssignmentProblem { rmm::device_uvector obj_val_dual_v; public: - LinearAssignmentProblem(raft::handle_t const &handle, vertex_t size, - vertex_t batchsize, weight_t epsilon) + LinearAssignmentProblem(raft::handle_t const& handle, + vertex_t size, + vertex_t batchsize, + weight_t epsilon) : handle_(handle), size_(size), batchsize_(batchsize), @@ -79,11 +81,13 @@ class LinearAssignmentProblem { row_children_v(0, handle_.get_stream()), col_children_v(0, handle_.get_stream()), obj_val_primal_v(0, handle_.get_stream()), - obj_val_dual_v(0, handle_.get_stream()) {} + obj_val_dual_v(0, handle_.get_stream()) + { + } // Executes Hungarian algorithm on the input cost matrix. - void solve(weight_t const *d_cost_matrix, vertex_t *d_row_assignment, - vertex_t *d_col_assignment) { + void solve(weight_t const* d_cost_matrix, vertex_t* d_row_assignment, vertex_t* d_col_assignment) + { initializeDevice(); d_vertices_dev.row_assignments = d_row_assignment; @@ -95,27 +99,13 @@ class LinearAssignmentProblem { while (step != 100) { switch (step) { - case 0: - step = hungarianStep0(); - break; - case 1: - step = hungarianStep1(); - break; - case 2: - step = hungarianStep2(); - break; - case 3: - step = hungarianStep3(); - break; - case 4: - step = hungarianStep4(); - break; - case 5: - step = hungarianStep5(); - break; - case 6: - step = hungarianStep6(); - break; + case 0: step = hungarianStep0(); break; + case 1: step = hungarianStep1(); break; + case 2: step = hungarianStep2(); break; + case 3: step = hungarianStep3(); break; + case 4: step = hungarianStep4(); break; + case 5: step = hungarianStep5(); break; + case 6: step = hungarianStep6(); break; } } @@ -123,36 +113,39 @@ class LinearAssignmentProblem { } // Function for getting optimal row dual vector for subproblem spId. - std::pair getRowDualVector(int spId) const { + std::pair getRowDualVector(int spId) const + { return std::make_pair(row_duals_v.data() + spId * size_, size_); } // Function for getting optimal col dual vector for subproblem spId. - std::pair getColDualVector(int spId) { + std::pair getColDualVector(int spId) + { return std::make_pair(col_duals_v.data() + spId * size_, size_); } // Function for getting optimal primal objective value for subproblem spId. - weight_t getPrimalObjectiveValue(int spId) { + weight_t getPrimalObjectiveValue(int spId) + { weight_t result; - raft::update_host(&result, obj_val_primal_v.data() + spId, 1, - handle_.get_stream()); + raft::update_host(&result, obj_val_primal_v.data() + spId, 1, handle_.get_stream()); CHECK_CUDA(handle_.get_stream()); return result; } // Function for getting optimal dual objective value for subproblem spId. - weight_t getDualObjectiveValue(int spId) { + weight_t getDualObjectiveValue(int spId) + { weight_t result; - raft::update_host(&result, obj_val_dual_v.data() + spId, 1, - handle_.get_stream()); + raft::update_host(&result, obj_val_dual_v.data() + spId, 1, handle_.get_stream()); CHECK_CUDA(handle_.get_stream()); return result; } private: // Helper function for initializing global variables and arrays on a single host. - void initializeDevice() { + void initializeDevice() + { cudaStream_t stream = handle_.get_stream(); row_covers_v.resize(batchsize_ * size_, stream); col_covers_v.resize(batchsize_ * size_, stream); @@ -171,39 +164,36 @@ class LinearAssignmentProblem { d_vertices_dev.row_covers = row_covers_v.data(); d_vertices_dev.col_covers = col_covers_v.data(); - d_vertices_dev.row_duals = row_duals_v.data(); - d_vertices_dev.col_duals = col_duals_v.data(); + d_vertices_dev.row_duals = row_duals_v.data(); + d_vertices_dev.col_duals = col_duals_v.data(); d_vertices_dev.col_slacks = col_slacks_v.data(); d_row_data_dev.is_visited = row_is_visited_v.data(); d_col_data_dev.is_visited = col_is_visited_v.data(); - d_row_data_dev.parents = row_parents_v.data(); - d_row_data_dev.children = row_children_v.data(); - d_col_data_dev.parents = col_parents_v.data(); - d_col_data_dev.children = col_children_v.data(); - - thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(), - int{0}); - thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(), - int{0}); - thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(), - weight_t{0}); - thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(), - weight_t{0}); + d_row_data_dev.parents = row_parents_v.data(); + d_row_data_dev.children = row_children_v.data(); + d_col_data_dev.parents = col_parents_v.data(); + d_col_data_dev.children = col_children_v.data(); + + thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(), int{0}); + thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(), int{0}); + thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(), weight_t{0}); + thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(), weight_t{0}); } // Function for calculating initial zeros by subtracting row and column minima from each element. - int hungarianStep0() { - detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_, - size_); + int hungarianStep0() + { + detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_, size_); return 1; } // Function for calculating initial zeros by subtracting row and column minima from each element. - int hungarianStep1() { - detail::computeInitialAssignments(handle_, d_costs_, d_vertices_dev, - batchsize_, size_, epsilon_); + int hungarianStep1() + { + detail::computeInitialAssignments( + handle_, d_costs_, d_vertices_dev, batchsize_, size_, epsilon_); int next = 2; @@ -219,10 +209,10 @@ class LinearAssignmentProblem { } // Function for checking optimality and constructing predicates and covers. - int hungarianStep2() { - int cover_count = - detail::computeRowCovers(handle_, d_vertices_dev, d_row_data_dev, - d_col_data_dev, batchsize_, size_); + int hungarianStep2() + { + int cover_count = detail::computeRowCovers( + handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_); int next = (cover_count == batchsize_ * size_) ? 6 : 3; @@ -230,7 +220,8 @@ class LinearAssignmentProblem { } // Function for building alternating tree rooted at unassigned rows. - int hungarianStep3() { + int hungarianStep3() + { int next; rmm::device_scalar flag_v(handle_.get_stream()); @@ -238,8 +229,14 @@ class LinearAssignmentProblem { bool h_flag = false; flag_v.set_value_async(h_flag, handle_.get_stream()); - detail::executeZeroCover(handle_, d_costs_, d_vertices_dev, d_row_data_dev, - d_col_data_dev, flag_v.data(), batchsize_, size_, + detail::executeZeroCover(handle_, + d_costs_, + d_vertices_dev, + d_row_data_dev, + d_col_data_dev, + flag_v.data(), + batchsize_, + size_, epsilon_); h_flag = flag_v.value(handle_.get_stream()); @@ -250,31 +247,36 @@ class LinearAssignmentProblem { } // Function for augmenting the solution along multiple node-disjoint alternating trees. - int hungarianStep4() { - detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_, - size_); + int hungarianStep4() + { + detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_, size_); - detail::augmentationPass(handle_, d_vertices_dev, d_row_data_dev, - d_col_data_dev, batchsize_, size_); + detail::augmentationPass( + handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_); return 2; } // Function for updating dual solution to introduce new zero-cost arcs. - int hungarianStep5() { - detail::dualUpdate(handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, - batchsize_, size_, epsilon_); + int hungarianStep5() + { + detail::dualUpdate( + handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_, epsilon_); return 3; } // Function for calculating primal and dual objective values at optimality. - int hungarianStep6() { - detail::calcObjValPrimal(handle_, obj_val_primal_v.data(), d_costs_, - d_vertices_dev.row_assignments, batchsize_, size_); + int hungarianStep6() + { + detail::calcObjValPrimal(handle_, + obj_val_primal_v.data(), + d_costs_, + d_vertices_dev.row_assignments, + batchsize_, + size_); - detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev, - batchsize_, size_); + detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev, batchsize_, size_); return 100; } diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/lap_functions.cuh index 830940f0ec..ab4aa2df59 100644 --- a/cpp/include/raft/lap/lap_functions.cuh +++ b/cpp/include/raft/lap/lap_functions.cuh @@ -45,20 +45,26 @@ const int BLOCKDIMX{64}; const int BLOCKDIMY{1}; // Function for calculating grid and block dimensions from the given input size. -inline void calculateLinearDims(dim3 &blocks_per_grid, dim3 &threads_per_block, - int &total_blocks, int size) { +inline void calculateLinearDims(dim3& blocks_per_grid, + dim3& threads_per_block, + int& total_blocks, + int size) +{ threads_per_block.x = BLOCKDIMX * BLOCKDIMY; int value = size / threads_per_block.x; if (size % threads_per_block.x > 0) value++; - total_blocks = value; + total_blocks = value; blocks_per_grid.x = value; } // Function for calculating grid and block dimensions from the given input size for square grid. -inline void calculateSquareDims(dim3 &blocks_per_grid, dim3 &threads_per_block, - int &total_blocks, int size) { +inline void calculateSquareDims(dim3& blocks_per_grid, + dim3& threads_per_block, + int& total_blocks, + int size) +{ threads_per_block.x = BLOCKDIMX; threads_per_block.y = BLOCKDIMY; @@ -67,15 +73,16 @@ inline void calculateSquareDims(dim3 &blocks_per_grid, dim3 &threads_per_block, int valuex = (int)ceil((float)(sq_size) / BLOCKDIMX); int valuey = (int)ceil((float)(sq_size) / BLOCKDIMY); - total_blocks = valuex * valuey; + total_blocks = valuex * valuey; blocks_per_grid.x = valuex; blocks_per_grid.y = valuey; } -// Function for calculating grid and block dimensions from the given input size for rectangular grid. -inline void calculateRectangularDims(dim3 &blocks_per_grid, - dim3 &threads_per_block, int &total_blocks, - int xsize, int ysize) { +// Function for calculating grid and block dimensions from the given input size for rectangular +// grid. +inline void calculateRectangularDims( + dim3& blocks_per_grid, dim3& threads_per_block, int& total_blocks, int xsize, int ysize) +{ threads_per_block.x = BLOCKDIMX; threads_per_block.y = BLOCKDIMY; @@ -85,16 +92,18 @@ inline void calculateRectangularDims(dim3 &blocks_per_grid, int valuey = ysize / threads_per_block.y; if (ysize % threads_per_block.y > 0) valuey++; - total_blocks = valuex * valuey; + total_blocks = valuex * valuey; blocks_per_grid.x = valuex; blocks_per_grid.y = valuey; } template -inline void initialReduction(raft::handle_t const &handle, - weight_t const *d_costs, - Vertices &d_vertices_dev, - int SP, vertex_t N) { +inline void initialReduction(raft::handle_t const& handle, + weight_t const* d_costs, + Vertices& d_vertices_dev, + int SP, + vertex_t N) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; @@ -102,24 +111,28 @@ inline void initialReduction(raft::handle_t const &handle, raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_rowReduction<<>>( - d_costs, d_vertices_dev.row_duals, SP, N, - std::numeric_limits::max()); + kernel_rowReduction<<>>( + d_costs, d_vertices_dev.row_duals, SP, N, std::numeric_limits::max()); CHECK_CUDA(handle.get_stream()); - kernel_columnReduction<<>>( - d_costs, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N, + kernel_columnReduction<<>>( + d_costs, + d_vertices_dev.row_duals, + d_vertices_dev.col_duals, + SP, + N, std::numeric_limits::max()); CHECK_CUDA(handle.get_stream()); } template -inline void computeInitialAssignments(raft::handle_t const &handle, - weight_t const *d_costs, - Vertices &d_vertices, - int SP, vertex_t N, weight_t epsilon) { +inline void computeInitialAssignments(raft::handle_t const& handle, + weight_t const* d_costs, + Vertices& d_vertices, + int SP, + vertex_t N, + weight_t epsilon) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; @@ -137,21 +150,29 @@ inline void computeInitialAssignments(raft::handle_t const &handle, raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_computeInitialAssignments<<>>( - d_costs, d_vertices.row_duals, d_vertices.col_duals, - d_vertices.row_assignments, d_vertices.col_assignments, row_lock_v.data(), - col_lock_v.data(), SP, N, epsilon); + kernel_computeInitialAssignments<<>>( + d_costs, + d_vertices.row_duals, + d_vertices.col_duals, + d_vertices.row_assignments, + d_vertices.col_assignments, + row_lock_v.data(), + col_lock_v.data(), + SP, + N, + epsilon); CHECK_CUDA(handle.get_stream()); } // Function for finding row cover on individual devices. template -inline int computeRowCovers(raft::handle_t const &handle, - Vertices &d_vertices, - VertexData &d_row_data, - VertexData &d_col_data, int SP, - vertex_t N) { +inline int computeRowCovers(raft::handle_t const& handle, + Vertices& d_vertices, + VertexData& d_row_data, + VertexData& d_col_data, + int SP, + vertex_t N) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; @@ -160,8 +181,7 @@ inline int computeRowCovers(raft::handle_t const &handle, thrust::fill_n(thrust::device, d_vertices.row_covers, size, int{0}); thrust::fill_n(thrust::device, d_vertices.col_covers, size, int{0}); - thrust::fill_n(thrust::device, d_vertices.col_slacks, size, - std::numeric_limits::max()); + thrust::fill_n(thrust::device, d_vertices.col_slacks, size, std::numeric_limits::max()); thrust::fill_n(thrust::device, d_row_data.is_visited, size, DORMANT); thrust::fill_n(thrust::device, d_col_data.is_visited, size, DORMANT); thrust::fill_n(thrust::device, d_row_data.parents, size, vertex_t{-1}); @@ -171,25 +191,28 @@ inline int computeRowCovers(raft::handle_t const &handle, raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_computeRowCovers<<>>( - d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited, - SP, N); + kernel_computeRowCovers<<>>( + d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited, SP, N); CHECK_CUDA(handle.get_stream()); - return thrust::reduce(thrust::device, d_vertices.row_covers, - d_vertices.row_covers + size); + return thrust::reduce(thrust::device, d_vertices.row_covers, d_vertices.row_covers + size); } // Function for covering the zeros in uncovered rows and expanding the frontier. template -inline void coverZeroAndExpand( - raft::handle_t const &handle, weight_t const *d_costs_dev, - vertex_t const *d_rows_csr_neighbors, vertex_t const *d_rows_csr_ptrs, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, VertexData &d_col_data_dev, - bool *d_flag, int SP, vertex_t N, weight_t epsilon) { +inline void coverZeroAndExpand(raft::handle_t const& handle, + weight_t const* d_costs_dev, + vertex_t const* d_rows_csr_neighbors, + vertex_t const* d_rows_csr_ptrs, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + bool* d_flag, + int SP, + vertex_t N, + weight_t epsilon) +{ int total_blocks = 0; dim3 blocks_per_grid; dim3 threads_per_block; @@ -197,20 +220,30 @@ inline void coverZeroAndExpand( raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_coverAndExpand<<>>( - d_flag, d_rows_csr_ptrs, d_rows_csr_neighbors, d_costs_dev, d_vertices_dev, - d_row_data_dev, d_col_data_dev, SP, N, epsilon); + kernel_coverAndExpand<<>>( + d_flag, + d_rows_csr_ptrs, + d_rows_csr_neighbors, + d_costs_dev, + d_vertices_dev, + d_row_data_dev, + d_col_data_dev, + SP, + N, + epsilon); } template -inline vertex_t zeroCoverIteration(raft::handle_t const &handle, - weight_t const *d_costs_dev, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, - bool *d_flag, int SP, vertex_t N, - weight_t epsilon) { +inline vertex_t zeroCoverIteration(raft::handle_t const& handle, + weight_t const* d_costs_dev, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + bool* d_flag, + int SP, + vertex_t N, + weight_t epsilon) +{ vertex_t M; rmm::device_uvector csr_ptrs_v(0, handle.get_stream()); @@ -235,65 +268,85 @@ inline vertex_t zeroCoverIteration(raft::handle_t const &handle, blocks_per_grid, threads_per_block, total_blocks, N, SP); // construct predicate matrix for edges. - kernel_rowPredicateConstructionCSR<<>>( - predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP, - N); + predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP, N); CHECK_CUDA(handle.get_stream()); M = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); - thrust::exclusive_scan(thrust::device, addresses_v.begin(), - addresses_v.end(), addresses_v.begin()); + thrust::exclusive_scan( + thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin()); if (M > 0) { csr_neighbors_v.resize(M, handle.get_stream()); - kernel_rowScatterCSR<<>>( - predicates_v.data(), addresses_v.data(), csr_neighbors_v.data(), - csr_ptrs_v.data(), M, SP, N); + kernel_rowScatterCSR<<>>( + predicates_v.data(), + addresses_v.data(), + csr_neighbors_v.data(), + csr_ptrs_v.data(), + M, + SP, + N); CHECK_CUDA(handle.get_stream()); } } if (M > 0) { - coverZeroAndExpand(handle, d_costs_dev, csr_neighbors_v.data(), - csr_ptrs_v.data(), d_vertices_dev, d_row_data_dev, - d_col_data_dev, d_flag, SP, N, epsilon); + coverZeroAndExpand(handle, + d_costs_dev, + csr_neighbors_v.data(), + csr_ptrs_v.data(), + d_vertices_dev, + d_row_data_dev, + d_col_data_dev, + d_flag, + SP, + N, + epsilon); } return M; } -// Function for executing recursive zero cover. Returns the next step (Step 4 or Step 5) depending on the presence of uncovered zeros. +// Function for executing recursive zero cover. Returns the next step (Step 4 or Step 5) depending +// on the presence of uncovered zeros. template -inline void executeZeroCover(raft::handle_t const &handle, - weight_t const *d_costs_dev, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, bool *d_flag, - int SP, vertex_t N, weight_t epsilon) { +inline void executeZeroCover(raft::handle_t const& handle, + weight_t const* d_costs_dev, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + bool* d_flag, + int SP, + vertex_t N, + weight_t epsilon) +{ vertex_t M = 1; while (M > 0) { - M = zeroCoverIteration(handle, d_costs_dev, d_vertices_dev, d_row_data_dev, - d_col_data_dev, d_flag, SP, N, epsilon); + M = zeroCoverIteration( + handle, d_costs_dev, d_vertices_dev, d_row_data_dev, d_col_data_dev, d_flag, SP, N, epsilon); } } // Function for executing reverse pass of the maximum matching. template -inline void reversePass(raft::handle_t const &handle, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, int SP, int N) { +inline void reversePass(raft::handle_t const& handle, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + int SP, + int N) +{ int total_blocks = 0; dim3 blocks_per_grid; dim3 threads_per_block; std::size_t size = SP * N; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, size); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, size); rmm::device_uvector predicates_v(size, handle.get_stream()); rmm::device_uvector addresses_v(size, handle.get_stream()); @@ -302,18 +355,19 @@ inline void reversePass(raft::handle_t const &handle, thrust::fill_n(thrust::device, addresses_v.data(), size, vertex_t{0}); // compact the reverse pass row vertices. - kernel_augmentPredicateConstruction<<>>( predicates_v.data(), addresses_v.data(), d_col_data_dev.is_visited, size); CHECK_CUDA(handle.get_stream()); // calculate total number of vertices. - std::size_t csr_size = - thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); + std::size_t csr_size = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); // exclusive scan for calculating the scatter addresses. - thrust::exclusive_scan(thrust::device, addresses_v.begin(), addresses_v.end(), - addresses_v.begin()); + thrust::exclusive_scan( + thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin()); if (csr_size > 0) { int total_blocks_1 = 0; @@ -324,14 +378,12 @@ inline void reversePass(raft::handle_t const &handle, rmm::device_uvector elements_v(csr_size, handle.get_stream()); - kernel_augmentScatter<<>>( + kernel_augmentScatter<<>>( elements_v.data(), predicates_v.data(), addresses_v.data(), size); CHECK_CUDA(handle.get_stream()); - kernel_reverseTraversal<<>>( + kernel_reverseTraversal<<>>( elements_v.data(), d_row_data_dev, d_col_data_dev, csr_size); CHECK_CUDA(handle.get_stream()); } @@ -339,16 +391,17 @@ inline void reversePass(raft::handle_t const &handle, // Function for executing augmentation pass of the maximum matching. template -inline void augmentationPass(raft::handle_t const &handle, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, int SP, - int N) { +inline void augmentationPass(raft::handle_t const& handle, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + int SP, + int N) +{ int total_blocks = 0; dim3 blocks_per_grid; dim3 threads_per_block; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, SP * N); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP * N); rmm::device_uvector predicates_v(SP * N, handle.get_stream()); rmm::device_uvector addresses_v(SP * N, handle.get_stream()); @@ -357,7 +410,9 @@ inline void augmentationPass(raft::handle_t const &handle, thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0}); // compact the reverse pass row vertices. - kernel_augmentPredicateConstruction<<>>( predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP * N); @@ -368,8 +423,8 @@ inline void augmentationPass(raft::handle_t const &handle, vertex_t row_ids_csr_size = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end()); // exclusive scan for calculating the scatter addresses. - thrust::exclusive_scan(thrust::device, addresses_v.begin(), addresses_v.end(), - addresses_v.begin()); + thrust::exclusive_scan( + thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin()); if (row_ids_csr_size > 0) { int total_blocks_1 = 0; @@ -378,20 +433,20 @@ inline void augmentationPass(raft::handle_t const &handle, raft::lap::detail::calculateLinearDims( blocks_per_grid_1, threads_per_block_1, total_blocks_1, row_ids_csr_size); - rmm::device_uvector elements_v(row_ids_csr_size, - handle.get_stream()); + rmm::device_uvector elements_v(row_ids_csr_size, handle.get_stream()); - kernel_augmentScatter<<>>( - elements_v.data(), predicates_v.data(), addresses_v.data(), - vertex_t{SP * N}); + kernel_augmentScatter<<>>( + elements_v.data(), predicates_v.data(), addresses_v.data(), vertex_t{SP * N}); CHECK_CUDA(handle.get_stream()); - kernel_augmentation<<>>( - d_vertices_dev.row_assignments, d_vertices_dev.col_assignments, - elements_v.data(), d_row_data_dev, d_col_data_dev, vertex_t{N}, + kernel_augmentation<<>>( + d_vertices_dev.row_assignments, + d_vertices_dev.col_assignments, + elements_v.data(), + d_row_data_dev, + d_col_data_dev, + vertex_t{N}, row_ids_csr_size); CHECK_CUDA(handle.get_stream()); @@ -399,34 +454,45 @@ inline void augmentationPass(raft::handle_t const &handle, } template -inline void dualUpdate(raft::handle_t const &handle, - Vertices &d_vertices_dev, - VertexData &d_row_data_dev, - VertexData &d_col_data_dev, int SP, vertex_t N, - weight_t epsilon) { +inline void dualUpdate(raft::handle_t const& handle, + Vertices& d_vertices_dev, + VertexData& d_row_data_dev, + VertexData& d_col_data_dev, + int SP, + vertex_t N, + weight_t epsilon) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks; rmm::device_scalar sp_min_v(handle.get_stream()); - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, SP); - kernel_dualUpdate_1<<>>( - sp_min_v.data(), d_vertices_dev.col_slacks, d_vertices_dev.col_covers, SP, - N, std::numeric_limits::max()); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); + kernel_dualUpdate_1<<>>( + sp_min_v.data(), + d_vertices_dev.col_slacks, + d_vertices_dev.col_covers, + SP, + N, + std::numeric_limits::max()); CHECK_CUDA(handle.get_stream()); raft::lap::detail::calculateRectangularDims( blocks_per_grid, threads_per_block, total_blocks, N, SP); - kernel_dualUpdate_2<<>>( - sp_min_v.data(), d_vertices_dev.row_duals, d_vertices_dev.col_duals, - d_vertices_dev.col_slacks, d_vertices_dev.row_covers, - d_vertices_dev.col_covers, d_row_data_dev.is_visited, - d_col_data_dev.parents, SP, N, std::numeric_limits::max(), + kernel_dualUpdate_2<<>>( + sp_min_v.data(), + d_vertices_dev.row_duals, + d_vertices_dev.col_duals, + d_vertices_dev.col_slacks, + d_vertices_dev.row_covers, + d_vertices_dev.col_covers, + d_row_data_dev.is_visited, + d_col_data_dev.parents, + SP, + N, + std::numeric_limits::max(), epsilon); CHECK_CUDA(handle.get_stream()); @@ -434,18 +500,19 @@ inline void dualUpdate(raft::handle_t const &handle, // Function for calculating optimal objective function value using dual variables. template -inline void calcObjValDual(raft::handle_t const &handle, weight_t *d_obj_val, - Vertices &d_vertices_dev, int SP, - int N) { +inline void calcObjValDual(raft::handle_t const& handle, + weight_t* d_obj_val, + Vertices& d_vertices_dev, + int SP, + int N) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, SP); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); - kernel_calcObjValDual<<>>( + kernel_calcObjValDual<<>>( d_obj_val, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N); CHECK_CUDA(handle.get_stream()); @@ -453,20 +520,21 @@ inline void calcObjValDual(raft::handle_t const &handle, weight_t *d_obj_val, // Function for calculating optimal objective function value using dual variables. template -inline void calcObjValPrimal(raft::handle_t const &handle, weight_t *d_obj_val, - weight_t const *d_costs, - vertex_t const *d_row_assignments, int SP, - vertex_t N) { +inline void calcObjValPrimal(raft::handle_t const& handle, + weight_t* d_obj_val, + weight_t const* d_costs, + vertex_t const* d_row_assignments, + int SP, + vertex_t N) +{ dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; - raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, - total_blocks, SP); + raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); - kernel_calcObjValPrimal<<>>(d_obj_val, d_costs, - d_row_assignments, SP, N); + kernel_calcObjValPrimal<<>>( + d_obj_val, d_costs, d_row_assignments, SP, N); CHECK_CUDA(handle.get_stream()); } diff --git a/cpp/include/raft/lap/lap_kernels.cuh b/cpp/include/raft/lap/lap_kernels.cuh index 14ad877aa4..328cbf3e74 100644 --- a/cpp/include/raft/lap/lap_kernels.cuh +++ b/cpp/include/raft/lap/lap_kernels.cuh @@ -45,42 +45,57 @@ const int AUGMENT{4}; const int MODIFIED{5}; template -bool __device__ near_zero(weight_t w, weight_t epsilon) { +bool __device__ near_zero(weight_t w, weight_t epsilon) +{ return ((w > -epsilon) && (w < epsilon)); } template <> -bool __device__ near_zero(int32_t w, int32_t epsilon) { +bool __device__ near_zero(int32_t w, int32_t epsilon) +{ return (w == 0); } template <> -bool __device__ near_zero(int64_t w, int64_t epsilon) { +bool __device__ near_zero(int64_t w, int64_t epsilon) +{ return (w == 0); } -// Device function for traversing the neighbors from start pointer to end pointer and updating the covers. -// The function sets d_next to 4 if there are uncovered zeros, indicating the requirement of Step 4 execution. +// Device function for traversing the neighbors from start pointer to end pointer and updating the +// covers. The function sets d_next to 4 if there are uncovered zeros, indicating the requirement of +// Step 4 execution. template -__device__ void cover_and_expand_row( - weight_t const *d_elements, weight_t const *d_row_duals, - weight_t const *d_col_duals, weight_t *d_col_slacks, int *d_row_covers, - int *d_col_covers, vertex_t const *d_col_assignments, bool *d_flag, - vertex_t *d_row_parents, vertex_t *d_col_parents, int *d_row_visited, - int *d_col_visited, vertex_t rowid, int spid, int colid, vertex_t N, - weight_t epsilon) { +__device__ void cover_and_expand_row(weight_t const* d_elements, + weight_t const* d_row_duals, + weight_t const* d_col_duals, + weight_t* d_col_slacks, + int* d_row_covers, + int* d_col_covers, + vertex_t const* d_col_assignments, + bool* d_flag, + vertex_t* d_row_parents, + vertex_t* d_col_parents, + int* d_row_visited, + int* d_col_visited, + vertex_t rowid, + int spid, + int colid, + vertex_t N, + weight_t epsilon) +{ int ROWID = spid * N + rowid; int COLID = spid * N + colid; - weight_t slack = d_elements[spid * N * N + rowid * N + colid] - - d_row_duals[ROWID] - d_col_duals[COLID]; + weight_t slack = + d_elements[spid * N * N + rowid * N + colid] - d_row_duals[ROWID] - d_col_duals[COLID]; int nxt_rowid = d_col_assignments[COLID]; int NXT_ROWID = spid * N + nxt_rowid; if (rowid != nxt_rowid && d_col_covers[COLID] == 0) { if (slack < d_col_slacks[COLID]) { - d_col_slacks[COLID] = slack; + d_col_slacks[COLID] = slack; d_col_parents[COLID] = ROWID; } @@ -89,13 +104,12 @@ __device__ void cover_and_expand_row( d_row_parents[NXT_ROWID] = COLID; // update parent info d_row_covers[NXT_ROWID] = 0; - d_col_covers[COLID] = 1; + d_col_covers[COLID] = 1; - if (d_row_visited[NXT_ROWID] != VISITED) - d_row_visited[NXT_ROWID] = ACTIVE; + if (d_row_visited[NXT_ROWID] != VISITED) d_row_visited[NXT_ROWID] = ACTIVE; } else { d_col_visited[COLID] = REVERSE; - *d_flag = true; + *d_flag = true; } } } @@ -104,28 +118,34 @@ __device__ void cover_and_expand_row( // Device function for traversing an alternating path from unassigned row to unassigned column. template -__device__ void __reverse_traversal( - int *d_row_visited, vertex_t *d_row_children, vertex_t *d_col_children, - vertex_t const *d_row_parents, vertex_t const *d_col_parents, int cur_colid) { +__device__ void __reverse_traversal(int* d_row_visited, + vertex_t* d_row_children, + vertex_t* d_col_children, + vertex_t const* d_row_parents, + vertex_t const* d_col_parents, + int cur_colid) +{ int cur_rowid = -1; while (cur_colid != -1) { d_col_children[cur_colid] = cur_rowid; - cur_rowid = d_col_parents[cur_colid]; + cur_rowid = d_col_parents[cur_colid]; d_row_children[cur_rowid] = cur_colid; - cur_colid = d_row_parents[cur_rowid]; + cur_colid = d_row_parents[cur_rowid]; } d_row_visited[cur_rowid] = AUGMENT; } // Device function for augmenting the alternating path from unassigned column to unassigned row. template -__device__ void __augment(vertex_t *d_row_assignments, - vertex_t *d_col_assignments, - vertex_t const *d_row_children, - vertex_t const *d_col_children, vertex_t cur_rowid, - vertex_t N) { +__device__ void __augment(vertex_t* d_row_assignments, + vertex_t* d_col_assignments, + vertex_t const* d_row_children, + vertex_t const* d_col_children, + vertex_t cur_rowid, + vertex_t N) +{ int cur_colid = -1; while (cur_rowid != -1) { @@ -142,20 +162,18 @@ __device__ void __augment(vertex_t *d_row_assignments, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_rowReduction(weight_t const *d_costs, - weight_t *d_row_duals, int SP, vertex_t N, - weight_t infinity) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; - int rowid = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void kernel_rowReduction( + weight_t const* d_costs, weight_t* d_row_duals, int SP, vertex_t N, weight_t infinity) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; + int rowid = blockIdx.x * blockDim.x + threadIdx.x; weight_t min = infinity; if (spid < SP && rowid < N) { for (int colid = 0; colid < N; colid++) { weight_t slack = d_costs[spid * N * N + rowid * N + colid]; - if (slack < min) { - min = slack; - } + if (slack < min) { min = slack; } } d_row_duals[spid * N + rowid] = min; @@ -166,25 +184,26 @@ __global__ void kernel_rowReduction(weight_t const *d_costs, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_columnReduction(weight_t const *d_costs, - weight_t const *d_row_duals, - weight_t *d_col_duals, int SP, - vertex_t N, weight_t infinity) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_columnReduction(weight_t const* d_costs, + weight_t const* d_row_duals, + weight_t* d_col_duals, + int SP, + vertex_t N, + weight_t infinity) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int colid = blockIdx.x * blockDim.x + threadIdx.x; weight_t min = infinity; if (spid < SP && colid < N) { for (int rowid = 0; rowid < N; rowid++) { - weight_t cost = d_costs[spid * N * N + rowid * N + colid]; + weight_t cost = d_costs[spid * N * N + rowid * N + colid]; weight_t row_dual = d_row_duals[spid * N + rowid]; weight_t slack = cost - row_dual; - if (slack < min) { - min = slack; - } + if (slack < min) { min = slack; } } d_col_duals[spid * N + colid] = min; @@ -193,12 +212,18 @@ __global__ void kernel_columnReduction(weight_t const *d_costs, // Kernel for calculating initial assignments. template -__global__ void kernel_computeInitialAssignments( - weight_t const *d_costs, weight_t const *d_row_duals, - weight_t const *d_col_duals, vertex_t *d_row_assignments, - vertex_t *d_col_assignments, int *d_row_lock, int *d_col_lock, int SP, - vertex_t N, weight_t epsilon) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_computeInitialAssignments(weight_t const* d_costs, + weight_t const* d_row_duals, + weight_t const* d_col_duals, + vertex_t* d_row_assignments, + vertex_t* d_col_assignments, + int* d_row_lock, + int* d_col_lock, + int SP, + vertex_t N, + weight_t epsilon) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int colid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && colid < N) { @@ -210,15 +235,15 @@ __global__ void kernel_computeInitialAssignments( if (d_col_lock[overall_colid] == 1) break; - weight_t cost = d_costs[spid * N * N + rowid * N + colid]; + weight_t cost = d_costs[spid * N * N + rowid * N + colid]; weight_t row_dual = d_row_duals[overall_rowid]; - weight_t slack = cost - row_dual - col_dual; + weight_t slack = cost - row_dual - col_dual; if (near_zero(slack, epsilon)) { if (atomicCAS(&d_row_lock[overall_rowid], 0, 1) == 0) { d_row_assignments[overall_rowid] = colid; d_col_assignments[overall_colid] = rowid; - d_col_lock[overall_colid] = 1; + d_col_lock[overall_colid] = 1; } } } @@ -227,10 +252,10 @@ __global__ void kernel_computeInitialAssignments( // Kernel for populating the cover arrays and initializing alternating tree. template -__global__ void kernel_computeRowCovers(vertex_t *d_row_assignments, - int *d_row_covers, int *d_row_visited, - int SP, vertex_t N) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_computeRowCovers( + vertex_t* d_row_assignments, int* d_row_covers, int* d_row_visited, int SP, vertex_t N) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int rowid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && rowid < N) { @@ -246,11 +271,10 @@ __global__ void kernel_computeRowCovers(vertex_t *d_row_assignments, // Kernel for populating the predicate matrix for edges in row major format. template -__global__ void kernel_rowPredicateConstructionCSR(bool *d_predicates, - vertex_t *d_addresses, - int *d_row_visited, int SP, - vertex_t N) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_rowPredicateConstructionCSR( + bool* d_predicates, vertex_t* d_addresses, int* d_row_visited, int SP, vertex_t N) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int rowid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && rowid < N) { @@ -258,130 +282,160 @@ __global__ void kernel_rowPredicateConstructionCSR(bool *d_predicates, if (d_row_visited[index] == ACTIVE) { d_predicates[index] = true; - d_addresses[index] = 1; + d_addresses[index] = 1; } else { d_predicates[index] = false; - d_addresses[index] = 0; + d_addresses[index] = 0; } } } // Kernel for scattering the edges based on the scatter addresses. template -__global__ void kernel_rowScatterCSR(bool const *d_predicates, - vertex_t const *d_addresses, - vertex_t *d_neighbors, vertex_t *d_ptrs, - vertex_t M, int SP, vertex_t N) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; +__global__ void kernel_rowScatterCSR(bool const* d_predicates, + vertex_t const* d_addresses, + vertex_t* d_neighbors, + vertex_t* d_ptrs, + vertex_t M, + int SP, + vertex_t N) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int rowid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && rowid < N) { int index = spid * N + rowid; - bool predicate = d_predicates[index]; + bool predicate = d_predicates[index]; vertex_t compid = d_addresses[index]; - if (predicate) { - d_neighbors[compid] = rowid; - } + if (predicate) { d_neighbors[compid] = rowid; } if (rowid == 0) { d_ptrs[spid] = compid; - d_ptrs[SP] = M; + d_ptrs[SP] = M; } } } // Kernel for finding the minimum zero cover. template -__global__ void kernel_coverAndExpand(bool *d_flag, vertex_t const *d_ptrs, - vertex_t const *d_neighbors, - weight_t const *d_elements, +__global__ void kernel_coverAndExpand(bool* d_flag, + vertex_t const* d_ptrs, + vertex_t const* d_neighbors, + weight_t const* d_elements, Vertices d_vertices, VertexData d_row_data, - VertexData d_col_data, int SP, - vertex_t N, weight_t epsilon) { - int spid = blockIdx.y * blockDim.y + threadIdx.y; + VertexData d_col_data, + int SP, + vertex_t N, + weight_t epsilon) +{ + int spid = blockIdx.y * blockDim.y + threadIdx.y; int colid = blockIdx.x * blockDim.x + threadIdx.x; // Load values into local memory if (spid < SP && colid < N) { thrust::for_each( - thrust::seq, d_neighbors + d_ptrs[spid], d_neighbors + d_ptrs[spid + 1], - [d_elements, d_vertices, d_flag, d_row_data, d_col_data, spid, colid, N, - epsilon] __device__(vertex_t rowid) { - cover_and_expand_row( - d_elements, d_vertices.row_duals, d_vertices.col_duals, - d_vertices.col_slacks, d_vertices.row_covers, d_vertices.col_covers, - d_vertices.col_assignments, d_flag, d_row_data.parents, - d_col_data.parents, d_row_data.is_visited, d_col_data.is_visited, - rowid, spid, colid, N, epsilon); + thrust::seq, + d_neighbors + d_ptrs[spid], + d_neighbors + d_ptrs[spid + 1], + [d_elements, d_vertices, d_flag, d_row_data, d_col_data, spid, colid, N, epsilon] __device__( + vertex_t rowid) { + cover_and_expand_row(d_elements, + d_vertices.row_duals, + d_vertices.col_duals, + d_vertices.col_slacks, + d_vertices.row_covers, + d_vertices.col_covers, + d_vertices.col_assignments, + d_flag, + d_row_data.parents, + d_col_data.parents, + d_row_data.is_visited, + d_col_data.is_visited, + rowid, + spid, + colid, + N, + epsilon); }); } } // Kernel for constructing the predicates for reverse pass or augmentation candidates. template -__global__ void kernel_augmentPredicateConstruction(bool *d_predicates, - vertex_t *d_addresses, - int *d_visited, int size) { +__global__ void kernel_augmentPredicateConstruction(bool* d_predicates, + vertex_t* d_addresses, + int* d_visited, + int size) +{ int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { int visited = d_visited[id]; if ((visited == REVERSE) || (visited == AUGMENT)) { d_predicates[id] = true; - d_addresses[id] = 1; + d_addresses[id] = 1; } else { d_predicates[id] = false; - d_addresses[id] = 0; + d_addresses[id] = 0; } } } // Kernel for scattering the vertices based on the scatter addresses. template -__global__ void kernel_augmentScatter(vertex_t *d_elements, - bool const *d_predicates, - vertex_t const *d_addresses, - std::size_t size) { +__global__ void kernel_augmentScatter(vertex_t* d_elements, + bool const* d_predicates, + vertex_t const* d_addresses, + std::size_t size) +{ int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { - if (d_predicates[id]) { - d_elements[d_addresses[id]] = id; - } + if (d_predicates[id]) { d_elements[d_addresses[id]] = id; } } } // Kernel for executing the reverse pass of the maximum matching algorithm. template -__global__ void kernel_reverseTraversal(vertex_t *d_elements, +__global__ void kernel_reverseTraversal(vertex_t* d_elements, VertexData d_row_data, VertexData d_col_data, - int size) { + int size) +{ int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { - __reverse_traversal(d_row_data.is_visited, d_row_data.children, - d_col_data.children, d_row_data.parents, - d_col_data.parents, d_elements[id]); + __reverse_traversal(d_row_data.is_visited, + d_row_data.children, + d_col_data.children, + d_row_data.parents, + d_col_data.parents, + d_elements[id]); } } // Kernel for executing the augmentation pass of the maximum matching algorithm. template -__global__ void kernel_augmentation(vertex_t *d_row_assignments, - vertex_t *d_col_assignments, - vertex_t const *d_row_elements, +__global__ void kernel_augmentation(vertex_t* d_row_assignments, + vertex_t* d_col_assignments, + vertex_t const* d_row_elements, VertexData d_row_data, - VertexData d_col_data, vertex_t N, - vertex_t size) { + VertexData d_col_data, + vertex_t N, + vertex_t size) +{ int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < size) { - __augment(d_row_assignments, d_col_assignments, d_row_data.children, - d_col_data.children, d_row_elements[id], N); + __augment(d_row_assignments, + d_col_assignments, + d_row_data.children, + d_col_data.children, + d_row_elements[id], + N); } } @@ -389,18 +443,21 @@ __global__ void kernel_augmentation(vertex_t *d_row_assignments, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_dualUpdate_1(weight_t *d_sp_min, - weight_t const *d_col_slacks, - int const *d_col_covers, int SP, vertex_t N, - weight_t infinity) { +__global__ void kernel_dualUpdate_1(weight_t* d_sp_min, + weight_t const* d_col_slacks, + int const* d_col_covers, + int SP, + vertex_t N, + weight_t infinity) +{ int spid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP) { weight_t min = infinity; for (int colid = 0; colid < N; colid++) { - int index = spid * N + colid; + int index = spid * N + colid; weight_t slack = d_col_slacks[index]; - int col_cover = d_col_covers[index]; + int col_cover = d_col_covers[index]; if (col_cover == 0) if (slack < min) min = slack; @@ -414,21 +471,29 @@ __global__ void kernel_dualUpdate_1(weight_t *d_sp_min, // FIXME: Once cuda 10.2 is the standard should replace passing infinity // here with using cuda::std::numeric_limits::max() template -__global__ void kernel_dualUpdate_2( - weight_t const *d_sp_min, weight_t *d_row_duals, weight_t *d_col_duals, - weight_t *d_col_slacks, int const *d_row_covers, int const *d_col_covers, - int *d_row_visited, vertex_t *d_col_parents, int SP, vertex_t N, - weight_t infinity, weight_t epsilon) { +__global__ void kernel_dualUpdate_2(weight_t const* d_sp_min, + weight_t* d_row_duals, + weight_t* d_col_duals, + weight_t* d_col_slacks, + int const* d_row_covers, + int const* d_col_covers, + int* d_row_visited, + vertex_t* d_col_parents, + int SP, + vertex_t N, + weight_t infinity, + weight_t epsilon) +{ int spid = blockIdx.y * blockDim.y + threadIdx.y; - int id = blockIdx.x * blockDim.x + threadIdx.x; + int id = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP && id < N) { int index = spid * N + id; if (d_sp_min[spid] < infinity) { weight_t theta = d_sp_min[spid]; - int row_cover = d_row_covers[index]; - int col_cover = d_col_covers[index]; + int row_cover = d_row_covers[index]; + int col_cover = d_col_covers[index]; if (row_cover == 0) // Row vertex is reachable from source. d_row_duals[index] += theta; @@ -450,10 +515,12 @@ __global__ void kernel_dualUpdate_2( // Kernel for calculating optimal objective function value using dual variables. template -__global__ void kernel_calcObjValDual(weight_t *d_obj_val_dual, - weight_t const *d_row_duals, - weight_t const *d_col_duals, int SP, - vertex_t N) { +__global__ void kernel_calcObjValDual(weight_t* d_obj_val_dual, + weight_t const* d_row_duals, + weight_t const* d_col_duals, + int SP, + vertex_t N) +{ int spid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP) { @@ -468,10 +535,12 @@ __global__ void kernel_calcObjValDual(weight_t *d_obj_val_dual, // Kernel for calculating optimal objective function value using dual variables. template -__global__ void kernel_calcObjValPrimal(weight_t *d_obj_val_primal, - weight_t const *d_costs, - vertex_t const *d_row_assignments, - int SP, vertex_t N) { +__global__ void kernel_calcObjValPrimal(weight_t* d_obj_val_primal, + weight_t const* d_costs, + vertex_t const* d_row_assignments, + int SP, + vertex_t N) +{ int spid = blockIdx.x * blockDim.x + threadIdx.x; if (spid < SP) { diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh index 7a454f64e2..11d3174951 100644 --- a/cpp/include/raft/linalg/add.cuh +++ b/cpp/include/raft/linalg/add.cuh @@ -37,8 +37,8 @@ namespace linalg { * @param stream cuda stream where to launch work */ template -void addScalar(OutT *out, const InT *in, InT scalar, IdxType len, - cudaStream_t stream) { +void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) +{ auto op = [scalar] __device__(InT in) { return OutT(in + scalar); }; unaryOp(out, in, len, op, stream); } @@ -57,23 +57,24 @@ void addScalar(OutT *out, const InT *in, InT scalar, IdxType len, * @param stream cuda stream where to launch work */ template -void add(OutT *out, const InT *in1, const InT *in2, IdxType len, - cudaStream_t stream) { +void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) +{ auto op = [] __device__(InT a, InT b) { return OutT(a + b); }; binaryOp(out, in1, in2, len, op, stream); } template -__global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, - IdxType len) { +__global__ void add_dev_scalar_kernel(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len) +{ IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { - outDev[i] = inDev[i] + *singleScalarDev; - } + if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; } } -/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i] +/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and + * write result to outDev[i] * @tparam math_t data-type upon which the math operation will be performed * @tparam IdxType Integer type used to for addressing * @param outDev the output buffer @@ -83,14 +84,16 @@ __global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev, * @param stream cuda stream */ template -void addDevScalar(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, IdxType len, - cudaStream_t stream) { +void addDevScalar(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len, + cudaStream_t stream) +{ // TODO: block dimension has not been tuned dim3 block(256); dim3 grid(raft::ceildiv(len, (IdxType)block.x)); - add_dev_scalar_kernel - <<>>(outDev, inDev, singleScalarDev, len); + add_dev_scalar_kernel<<>>(outDev, inDev, singleScalarDev, len); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh index 940d786e87..a49a433941 100644 --- a/cpp/include/raft/linalg/binary_op.cuh +++ b/cpp/include/raft/linalg/binary_op.cuh @@ -22,10 +22,10 @@ namespace raft { namespace linalg { -template -__global__ void binaryOpKernel(OutType *out, const InType *in1, - const InType *in2, IdxType len, Lambda op) { +template +__global__ void binaryOpKernel( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op) +{ typedef TxN_t InVecType; typedef TxN_t OutVecType; InVecType a, b; @@ -42,12 +42,11 @@ __global__ void binaryOpKernel(OutType *out, const InType *in1, c.store(out, idx); } -template -void binaryOpImpl(OutType *out, const InType *in1, const InType *in2, - IdxType len, Lambda op, cudaStream_t stream) { - const IdxType nblks = - raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); +template +void binaryOpImpl( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); binaryOpKernel <<>>(out, in1, in2, len, op); CUDA_CHECK(cudaPeekAtLastError()); @@ -56,8 +55,8 @@ void binaryOpImpl(OutType *out, const InType *in1, const InType *in2, /** * @brief Checks if addresses are aligned on N bytes */ -inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, - uint64_t N) { +inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint64_t N) +{ return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0; } @@ -77,38 +76,36 @@ inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, * @note Lambda must be a functor with the following signature: * `OutType func(const InType& val1, const InType& val2);` */ -template -void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len, - Lambda op, cudaStream_t stream) { - constexpr auto maxSize = - sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType); - size_t bytes = len * maxSize; - uint64_t in1Addr = uint64_t(in1); - uint64_t in2Addr = uint64_t(in2); - uint64_t outAddr = uint64_t(out); - if (16 / maxSize && bytes % 16 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 16)) { +template +void binaryOp( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) +{ + constexpr auto maxSize = sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t in1Addr = uint64_t(in1); + uint64_t in2Addr = uint64_t(in2); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 16)) { binaryOpImpl( out, in1, in2, len, op, stream); - } else if (8 / maxSize && bytes % 8 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 8)) { + } else if (8 / maxSize && bytes % 8 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 8)) { binaryOpImpl( out, in1, in2, len, op, stream); - } else if (4 / maxSize && bytes % 4 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 4)) { + } else if (4 / maxSize && bytes % 4 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 4)) { binaryOpImpl( out, in1, in2, len, op, stream); - } else if (2 / maxSize && bytes % 2 == 0 && - addressAligned(in1Addr, in2Addr, outAddr, 2)) { + } else if (2 / maxSize && bytes % 2 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 2)) { binaryOpImpl( out, in1, in2, len, op, stream); } else if (1 / maxSize) { binaryOpImpl( out, in1, in2, len, op, stream); } else { - binaryOpImpl(out, in1, in2, len, - op, stream); + binaryOpImpl(out, in1, in2, len, op, stream); } } diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh index d6d064c20e..4b58133ac5 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh @@ -122,9 +122,16 @@ namespace linalg { * conditioned systems. Negative values mean no regularizaton. */ template -void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, - void *workspace, int *n_bytes, cublasFillMode_t uplo, - cudaStream_t stream, math_t eps = -1) { +void choleskyRank1Update(const raft::handle_t& handle, + math_t* L, + int n, + int ld, + void* workspace, + int* n_bytes, + cublasFillMode_t uplo, + cudaStream_t stream, + math_t eps = -1) +{ // The matrix A' is defined as: // A' = [[A_11, A_12] // [A_21, A_22]] @@ -144,18 +151,17 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, // We need a workspace in device memory to store a scalar. Additionally, in // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats. const int align = 256; - int offset = (uplo == CUBLAS_FILL_MODE_LOWER) - ? raft::alignTo(sizeof(math_t) * (n - 1), align) - : 0; + int offset = + (uplo == CUBLAS_FILL_MODE_LOWER) ? raft::alignTo(sizeof(math_t) * (n - 1), align) : 0; if (workspace == nullptr) { *n_bytes = offset + 1 * sizeof(math_t); return; } - math_t *s = reinterpret_cast(((char *)workspace) + offset); - math_t *L_22 = L + (n - 1) * ld + n - 1; + math_t* s = reinterpret_cast(((char*)workspace) + offset); + math_t* L_22 = L + (n - 1) * ld + n - 1; - math_t *A_new; - math_t *A_row; + math_t* A_new; + math_t* A_row; if (uplo == CUBLAS_FILL_MODE_UPPER) { // A_new is stored as the n-1 th column of L A_new = L + (n - 1) * ld; @@ -164,27 +170,36 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, // as the n-th row of L. Since the matrix is column major, this is non // contiguous. We copy elements from A_row to a contiguous workspace A_new. A_row = L + n - 1; - A_new = reinterpret_cast(workspace); - CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, - A_row, ld, A_new, 1, stream)); + A_new = reinterpret_cast(workspace); + CUBLAS_CHECK( + raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream)); } - cublasOperation_t op = - (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t op = (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N; if (n > 1) { // Calculate L_12 = x by solving equation L_11 x = A_12 math_t alpha = 1; - CUBLAS_CHECK(raft::linalg::cublastrsm( - handle.get_cublas_handle(), CUBLAS_SIDE_LEFT, uplo, op, - CUBLAS_DIAG_NON_UNIT, n - 1, 1, &alpha, L, ld, A_new, n - 1, stream)); + CUBLAS_CHECK(raft::linalg::cublastrsm(handle.get_cublas_handle(), + CUBLAS_SIDE_LEFT, + uplo, + op, + CUBLAS_DIAG_NON_UNIT, + n - 1, + 1, + &alpha, + L, + ld, + A_new, + n - 1, + stream)); // A_new now stores L_12, we calculate s = L_12 * L_12 - CUBLAS_CHECK(raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, - A_new, 1, A_new, 1, s, stream)); + CUBLAS_CHECK( + raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream)); if (uplo == CUBLAS_FILL_MODE_LOWER) { // Copy back the L_12 elements as the n-th row of L - CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, - A_new, 1, A_row, ld, stream)); + CUBLAS_CHECK( + raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream)); } } else { // n == 1 case CUDA_CHECK(cudaMemsetAsync(s, 0, sizeof(math_t), stream)); @@ -202,9 +217,7 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld, // the system is very ill conditioned then the A_22 - L_12 * L_12 can be // negative, which would result L_22 = NaN. A small positive eps parameter // can be used to prevent this. - if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { - L_22_host = eps; - } + if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { L_22_host = eps; } ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update"); raft::update_device(L_22, &L_22_host, 1, stream); } diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh index ef983ff3d0..7e0744f98a 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/coalesced_reduction.cuh @@ -26,18 +26,27 @@ namespace linalg { // of the matrix, i.e. reduce along rows for row major or reduce along columns // for column major layout. Kernel does an inplace reduction adding to original // values of dots. -template -__global__ void coalescedReductionKernel(OutType *dots, const InType *data, - int D, int N, OutType init, +template +__global__ void coalescedReductionKernel(OutType* dots, + const InType* data, + int D, + int N, + OutType init, MainLambda main_op, ReduceLambda reduce_op, FinalLambda final_op, - bool inplace = false) { + bool inplace = false) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; OutType thread_data = init; - IdxType rowStart = blockIdx.x * D; + IdxType rowStart = blockIdx.x * D; for (IdxType i = threadIdx.x; i < D; i += TPB) { IdxType idx = rowStart + i; thread_data = reduce_op(thread_data, main_op(data[idx], i)); @@ -79,33 +88,37 @@ __global__ void coalescedReductionKernel(OutType *dots, const InType *data, * @param inplace reduction result added inplace or overwrites old values? * @param stream cuda stream where to launch work */ -template , +template , typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void coalescedReduction(OutType *dots, const InType *data, int D, int N, - OutType init, cudaStream_t stream, bool inplace = false, - MainLambda main_op = raft::Nop(), + typename FinalLambda = raft::Nop> +void coalescedReduction(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) { + FinalLambda final_op = raft::Nop()) +{ // One block per reduction // Efficient only for large leading dimensions if (D <= 32) { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); } else if (D <= 64) { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); } else if (D <= 128) { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); } else { coalescedReductionKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, - final_op, inplace); + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh index e6ff8a49ce..817bfeab5c 100644 --- a/cpp/include/raft/linalg/contractions.cuh +++ b/cpp/include/raft/linalg/contractions.cuh @@ -55,8 +55,7 @@ namespace linalg { * thread block. This also determines the number of threads per * thread block */ -template +template struct KernelPolicy { enum { /** number of elements along K worked upon per main loop iteration */ @@ -101,8 +100,7 @@ struct KernelPolicy { }; // struct KernelPolicy -template +template struct ColKernelPolicy { enum { /** number of elements along K worked upon per main loop iteration */ @@ -151,7 +149,8 @@ struct ColKernelPolicy { * @{ */ template -struct Policy4x4 {}; +struct Policy4x4 { +}; template struct Policy4x4 { @@ -171,7 +170,8 @@ struct Policy4x4 { * @{ */ template -struct Policy2x8 {}; +struct Policy2x8 { +}; template struct Policy2x8 { @@ -201,8 +201,7 @@ struct Policy2x8 { * @tparam Policy policy used to customize memory access behavior. * See documentation for `KernelPolicy` to know more. */ -template +template struct Contractions_NT { protected: typedef Policy P; @@ -268,8 +267,7 @@ struct Contractions_NT { * @param[in] _k number of cols of X and Y * @param[in] _smem shared memory region used during computations */ - DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, - IdxT _k, char* _smem) + DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, IdxT _k, char* _smem) : m(_m), n(_n), k(_k), @@ -286,7 +284,9 @@ struct Contractions_NT { sx((DataT*)_smem), sy(&(sx[P::SmemPageX])), pageWr(0), - pageRd(0) {} + pageRd(0) + { + } /** * @brief Ctor @@ -297,8 +297,15 @@ struct Contractions_NT { * @param[in] _k number of cols of X and Y * @param[in] _smem shared memory region used during computations */ - DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, - IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, char* _smem) + DI Contractions_NT(const DataT* _x, + const DataT* _y, + IdxT _m, + IdxT _n, + IdxT _k, + IdxT _lda, + IdxT _ldb, + IdxT _ldd, + char* _smem) : m(_m), n(_n), k(_k), @@ -312,17 +319,18 @@ struct Contractions_NT { sx((DataT*)_smem), sy(&(sx[P::SmemPageX])), pageWr(0), - pageRd(0) { + pageRd(0) + { if (isRowMajor) { xrowid = IdxT(blockIdx.y) * P::Mblk + srowid; yrowid = IdxT(blockIdx.x) * P::Nblk + srowid; - x = _x + xrowid * lda; - y = _y + yrowid * ldb; + x = _x + xrowid * lda; + y = _y + yrowid * ldb; } else { xrowid = IdxT(blockIdx.y) * P::Mblk; yrowid = IdxT(blockIdx.x) * P::Nblk; - x = _x + xrowid + srowid * lda; - y = _y + yrowid + srowid * ldb; + x = _x + xrowid + srowid * lda; + y = _y + yrowid + srowid * ldb; } } @@ -331,7 +339,8 @@ struct Contractions_NT { * @brief Load current block of X/Y from global memory to registers * @param[in] kidx current start index of k to be loaded */ - DI void ldgXY(IdxT kidx) { + DI void ldgXY(IdxT kidx) + { ldgX(kidx); ldgY(kidx); } @@ -340,7 +349,8 @@ struct Contractions_NT { * @brief Store current block of X/Y from registers to smem * @param[in] kidx current start index of k to be loaded */ - DI void stsXY() { + DI void stsXY() + { stsX(sx + pageWr * P::SmemPage); stsY(sy + pageWr * P::SmemPage); } @@ -349,13 +359,15 @@ struct Contractions_NT { * @brief Load X and Y block from shared memory to registers * @param[in] kidx k value from the current k-block to be loaded from smem */ - DI void ldsXY(int kidx) { + DI void ldsXY(int kidx) + { ldsX(kidx, sx + pageRd * P::SmemPage); ldsY(kidx, sy + pageRd * P::SmemPage); } private: - DI void ldgX(IdxT kidx) { + DI void ldgX(IdxT kidx) + { if (isRowMajor) { auto numRows = m; auto koffset = kidx + scolid; @@ -372,11 +384,10 @@ struct Contractions_NT { } } else { const auto numRows = k; - auto koffset = scolid; + auto koffset = scolid; #pragma unroll for (int i = 0; i < P::LdgPerThX; ++i) { - if ((koffset + xrowid) < lda && - (srowid + kidx + i * P::LdgRowsX) < numRows) { + if ((koffset + xrowid) < lda && (srowid + kidx + i * P::LdgRowsX) < numRows) { ldg(ldgDataX[i], x + (kidx + i * P::LdgRowsX) * lda + koffset); } else { #pragma unroll @@ -388,7 +399,8 @@ struct Contractions_NT { } } - DI void ldgY(IdxT kidx) { + DI void ldgY(IdxT kidx) + { if (isRowMajor) { auto numRows = n; auto koffset = kidx + scolid; @@ -408,8 +420,7 @@ struct Contractions_NT { auto koffset = scolid; #pragma unroll for (int i = 0; i < P::LdgPerThY; ++i) { - if ((koffset + yrowid) < ldb && - (srowid + kidx + i * P::LdgRowsY) < numRows) { + if ((koffset + yrowid) < ldb && (srowid + kidx + i * P::LdgRowsY) < numRows) { ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset); } else { #pragma unroll @@ -421,7 +432,8 @@ struct Contractions_NT { } } - DI void stsX(DataT* smem) { + DI void stsX(DataT* smem) + { auto* saddr = smem + srowid * P::SmemStride + scolid; #pragma unroll for (int i = 0; i < P::LdgPerThX; ++i) { @@ -429,7 +441,8 @@ struct Contractions_NT { } } - DI void stsY(DataT* smem) { + DI void stsY(DataT* smem) + { auto* saddr = smem + srowid * P::SmemStride + scolid; #pragma unroll for (int i = 0; i < P::LdgPerThY; ++i) { @@ -437,7 +450,8 @@ struct Contractions_NT { } } - DI void ldsX(int kidx, DataT* smem) { + DI void ldsX(int kidx, DataT* smem) + { if (isRowMajor) { auto* saddr = smem + accrowid * P::SmemStride + kidx; #pragma unroll @@ -456,7 +470,8 @@ struct Contractions_NT { } } - DI void ldsY(int kidx, DataT* smem) { + DI void ldsY(int kidx, DataT* smem) + { if (isRowMajor) { auto* saddr = smem + acccolid * P::SmemStride + kidx; #pragma unroll diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h index 1be14a550d..3616d54506 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.h +++ b/cpp/include/raft/linalg/cublas_wrappers.h @@ -25,8 +25,7 @@ #include #define _CUBLAS_ERR_TO_STR(err) \ - case err: \ - return #err + case err: return #err namespace raft { @@ -34,15 +33,15 @@ namespace raft { * @brief Exception thrown when a cuBLAS error is encountered. */ struct cublas_error : public raft::exception { - explicit cublas_error(char const *const message) : raft::exception(message) {} - explicit cublas_error(std::string const &message) - : raft::exception(message) {} + explicit cublas_error(char const* const message) : raft::exception(message) {} + explicit cublas_error(std::string const& message) : raft::exception(message) {} }; namespace linalg { namespace detail { -inline const char *cublas_error_to_string(cublasStatus_t err) { +inline const char* cublas_error_to_string(cublasStatus_t err) +{ switch (err) { _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); @@ -54,8 +53,7 @@ inline const char *cublas_error_to_string(cublasStatus_t err) { _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); - default: - return "CUBLAS_STATUS_UNKNOWN"; + default: return "CUBLAS_STATUS_UNKNOWN"; }; } @@ -71,29 +69,34 @@ inline const char *cublas_error_to_string(cublasStatus_t err) { * Invokes a cuBLAS runtime API function call, if the call does not return * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred */ -#define CUBLAS_TRY(call) \ - do { \ - cublasStatus_t const status = (call); \ - if (CUBLAS_STATUS_SUCCESS != status) { \ - std::string msg{}; \ - SET_ERROR_MSG( \ - msg, "cuBLAS error encountered at: ", "call='%s', Reason=%d:%s", \ - #call, status, raft::linalg::detail::cublas_error_to_string(status)); \ - throw raft::cublas_error(msg); \ - } \ +#define CUBLAS_TRY(call) \ + do { \ + cublasStatus_t const status = (call); \ + if (CUBLAS_STATUS_SUCCESS != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "cuBLAS error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + raft::linalg::detail::cublas_error_to_string(status)); \ + throw raft::cublas_error(msg); \ + } \ } while (0) /** FIXME: temporary alias for cuML compatibility */ #define CUBLAS_CHECK(call) CUBLAS_TRY(call) /** check for cublas runtime API errors but do not assert */ -#define CUBLAS_CHECK_NO_THROW(call) \ - do { \ - cublasStatus_t err = call; \ - if (err != CUBLAS_STATUS_SUCCESS) { \ - CUML_LOG_ERROR("CUBLAS call='%s' got errorcode=%d err=%s", #call, err, \ - raft::linalg::detail::cublas_error_to_string(err)); \ - } \ +#define CUBLAS_CHECK_NO_THROW(call) \ + do { \ + cublasStatus_t err = call; \ + if (err != CUBLAS_STATUS_SUCCESS) { \ + CUML_LOG_ERROR("CUBLAS call='%s' got errorcode=%d err=%s", \ + #call, \ + err, \ + raft::linalg::detail::cublas_error_to_string(err)); \ + } \ } while (0) namespace raft { @@ -104,22 +107,39 @@ namespace linalg { * @{ */ template -cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, const T *alpha, - const T *x, int incx, T *y, int incy, +cublasStatus_t cublasaxpy(cublasHandle_t handle, + int n, + const T* alpha, + const T* x, + int incx, + T* y, + int incy, cudaStream_t stream); template <> -inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, - const float *alpha, const float *x, int incx, - float *y, int incy, cudaStream_t stream) { +inline cublasStatus_t cublasaxpy(cublasHandle_t handle, + int n, + const float* alpha, + const float* x, + int incx, + float* y, + int incy, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSaxpy(handle, n, alpha, x, incx, y, incy); } template <> -inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, - const double *alpha, const double *x, int incx, - double *y, int incy, cudaStream_t stream) { +inline cublasStatus_t cublasaxpy(cublasHandle_t handle, + int n, + const double* alpha, + const double* x, + int incx, + double* y, + int incy, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDaxpy(handle, n, alpha, x, incx, y, incy); } @@ -130,21 +150,21 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, * @{ */ template -cublasStatus_t cublasSwap(cublasHandle_t handle, int n, T *x, int incx, T *y, - int incy, cudaStream_t stream); +cublasStatus_t cublasSwap( + cublasHandle_t handle, int n, T* x, int incx, T* y, int incy, cudaStream_t stream); template <> -inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, float *x, - int incx, float *y, int incy, - cudaStream_t stream) { +inline cublasStatus_t cublasSwap( + cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSswap(handle, n, x, incx, y, incy); } template <> -inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, double *x, - int incx, double *y, int incy, - cudaStream_t stream) { +inline cublasStatus_t cublasSwap( + cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDswap(handle, n, x, incx, y, incy); } @@ -156,20 +176,20 @@ inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, double *x, * @{ */ template -cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const T *x, int incx, - T *y, int incy, cudaStream_t stream); +cublasStatus_t cublasCopy( + cublasHandle_t handle, int n, const T* x, int incx, T* y, int incy, cudaStream_t stream); template <> -inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const float *x, - int incx, float *y, int incy, - cudaStream_t stream) { +inline cublasStatus_t cublasCopy( + cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasScopy(handle, n, x, incx, y, incy); } template <> -inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const double *x, - int incx, double *y, int incy, - cudaStream_t stream) { +inline cublasStatus_t cublasCopy( + cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDcopy(handle, n, x, incx, y, incy); } @@ -180,31 +200,56 @@ inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const double *x, * @{ */ template -cublasStatus_t cublasgemv(cublasHandle_t handle, cublasOperation_t transA, - int m, int n, const T *alfa, const T *A, int lda, - const T *x, int incx, const T *beta, T *y, int incy, +cublasStatus_t cublasgemv(cublasHandle_t handle, + cublasOperation_t transA, + int m, + int n, + const T* alfa, + const T* A, + int lda, + const T* x, + int incx, + const T* beta, + T* y, + int incy, cudaStream_t stream); template <> inline cublasStatus_t cublasgemv(cublasHandle_t handle, - cublasOperation_t transA, int m, int n, - const float *alfa, const float *A, int lda, - const float *x, int incx, const float *beta, - float *y, int incy, cudaStream_t stream) { + cublasOperation_t transA, + int m, + int n, + const float* alfa, + const float* A, + int lda, + const float* x, + int incx, + const float* beta, + float* y, + int incy, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, - incy); + return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy); } template <> inline cublasStatus_t cublasgemv(cublasHandle_t handle, - cublasOperation_t transA, int m, int n, - const double *alfa, const double *A, int lda, - const double *x, int incx, const double *beta, - double *y, int incy, cudaStream_t stream) { + cublasOperation_t transA, + int m, + int n, + const double* alfa, + const double* A, + int lda, + const double* x, + int incx, + const double* beta, + double* y, + int incy, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, - incy); + return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy); } /** @} */ @@ -213,23 +258,47 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, const T *alpha, - const T *x, int incx, const T *y, int incy, T *A, - int lda, cudaStream_t stream); +cublasStatus_t cublasger(cublasHandle_t handle, + int m, + int n, + const T* alpha, + const T* x, + int incx, + const T* y, + int incy, + T* A, + int lda, + cudaStream_t stream); template <> -inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, - const float *alpha, const float *x, int incx, - const float *y, int incy, float *A, int lda, - cudaStream_t stream) { +inline cublasStatus_t cublasger(cublasHandle_t handle, + int m, + int n, + const float* alpha, + const float* x, + int incx, + const float* y, + int incy, + float* A, + int lda, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda); } template <> -inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, - const double *alpha, const double *x, int incx, - const double *y, int incy, double *A, int lda, - cudaStream_t stream) { +inline cublasStatus_t cublasger(cublasHandle_t handle, + int m, + int n, + const double* alpha, + const double* x, + int incx, + const double* y, + int incy, + double* A, + int lda, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda); } @@ -240,34 +309,62 @@ inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, * @{ */ template -cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, int k, - const T *alfa, const T *A, int lda, const T *B, - int ldb, const T *beta, T *C, int ldc, +cublasStatus_t cublasgemm(cublasHandle_t handle, + cublasOperation_t transA, + cublasOperation_t transB, + int m, + int n, + int k, + const T* alfa, + const T* A, + int lda, + const T* B, + int ldb, + const T* beta, + T* C, + int ldc, cudaStream_t stream); template <> inline cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, int k, - const float *alfa, const float *A, int lda, - const float *B, int ldb, const float *beta, - float *C, int ldc, cudaStream_t stream) { + cublasOperation_t transB, + int m, + int n, + int k, + const float* alfa, + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, + float* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, - beta, C, ldc); + return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc); } template <> inline cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, int k, - const double *alfa, const double *A, int lda, - const double *B, int ldb, const double *beta, - double *C, int ldc, cudaStream_t stream) { + cublasOperation_t transB, + int m, + int n, + int k, + const double* alfa, + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, + double* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, - beta, C, ldc); + return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc); } /** @} */ @@ -278,38 +375,93 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle, template cublasStatus_t cublasgemmBatched(cublasHandle_t handle, // NOLINT cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - const T *alpha, - const T *const Aarray[], // NOLINT - int lda, const T *const Barray[], // NOLINT - int ldb, const T *beta, - T *Carray[], // NOLINT - int ldc, int batchCount, cudaStream_t stream); + cublasOperation_t transb, + int m, + int n, + int k, + const T* alpha, + const T* const Aarray[], // NOLINT + int lda, + const T* const Barray[], // NOLINT + int ldb, + const T* beta, + T* Carray[], // NOLINT + int ldc, + int batchCount, + cudaStream_t stream); template <> inline cublasStatus_t cublasgemmBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const float *alpha, - const float *const Aarray[], // NOLINT - int lda, const float *const Barray[], // NOLINT - int ldb, const float *beta, float *Carray[], // NOLINT - int ldc, int batchCount, cudaStream_t stream) { + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, + const float* const Aarray[], // NOLINT + int lda, + const float* const Barray[], // NOLINT + int ldb, + const float* beta, + float* Carray[], // NOLINT + int ldc, + int batchCount, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, - Barray, ldb, beta, Carray, ldc, batchCount); + return cublasSgemmBatched(handle, + transa, + transb, + m, + n, + k, + alpha, + Aarray, + lda, + Barray, + ldb, + beta, + Carray, + ldc, + batchCount); } template <> inline cublasStatus_t cublasgemmBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const double *alpha, - const double *const Aarray[], // NOLINT - int lda, const double *const Barray[], // NOLINT - int ldb, const double *beta, double *Carray[], // NOLINT - int ldc, int batchCount, cudaStream_t stream) { + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double* alpha, + const double* const Aarray[], // NOLINT + int lda, + const double* const Barray[], // NOLINT + int ldb, + const double* beta, + double* Carray[], // NOLINT + int ldc, + int batchCount, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, - Barray, ldb, beta, Carray, ldc, batchCount); + return cublasDgemmBatched(handle, + transa, + transb, + m, + n, + k, + alpha, + Aarray, + lda, + Barray, + ldb, + beta, + Carray, + ldc, + batchCount); } /** @} */ @@ -319,36 +471,110 @@ inline cublasStatus_t cublasgemmBatched( // NOLINT */ template cublasStatus_t cublasgemmStridedBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const T *alpha, const T *const Aarray, int lda, - int64_t strideA, const T *const Barray, int ldb, int64_t strideB, - const T *beta, T *Carray, int ldc, int64_t strideC, int batchCount, + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const T* alpha, + const T* const Aarray, + int lda, + int64_t strideA, + const T* const Barray, + int ldb, + int64_t strideB, + const T* beta, + T* Carray, + int ldc, + int64_t strideC, + int batchCount, cudaStream_t stream); template <> inline cublasStatus_t cublasgemmStridedBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const float *alpha, const float *const Aarray, int lda, - int64_t strideA, const float *const Barray, int ldb, int64_t strideB, - const float *beta, float *Carray, int ldc, int64_t strideC, int batchCount, - cudaStream_t stream) { + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, + const float* const Aarray, + int lda, + int64_t strideA, + const float* const Barray, + int ldb, + int64_t strideB, + const float* beta, + float* Carray, + int ldc, + int64_t strideC, + int batchCount, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha, - Aarray, lda, strideA, Barray, ldb, strideB, - beta, Carray, ldc, strideC, batchCount); + return cublasSgemmStridedBatched(handle, + transa, + transb, + m, + n, + k, + alpha, + Aarray, + lda, + strideA, + Barray, + ldb, + strideB, + beta, + Carray, + ldc, + strideC, + batchCount); } template <> inline cublasStatus_t cublasgemmStridedBatched( // NOLINT - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const double *alpha, const double *const Aarray, int lda, - int64_t strideA, const double *const Barray, int ldb, int64_t strideB, - const double *beta, double *Carray, int ldc, int64_t strideC, int batchCount, - cudaStream_t stream) { + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double* alpha, + const double* const Aarray, + int lda, + int64_t strideA, + const double* const Barray, + int ldb, + int64_t strideB, + const double* beta, + double* Carray, + int ldc, + int64_t strideC, + int batchCount, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgemmStridedBatched(handle, transa, transb, m, n, k, alpha, - Aarray, lda, strideA, Barray, ldb, strideB, - beta, Carray, ldc, strideC, batchCount); + return cublasDgemmStridedBatched(handle, + transa, + transb, + m, + n, + k, + alpha, + Aarray, + lda, + strideA, + Barray, + ldb, + strideB, + beta, + Carray, + ldc, + strideC, + batchCount); } /** @} */ @@ -358,51 +584,85 @@ inline cublasStatus_t cublasgemmStridedBatched( // NOLINT */ template -cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, int n, // NOLINT - T *const A[], // NOLINT - int lda, int *P, int *info, int batchSize, +cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, + int n, // NOLINT + T* const A[], // NOLINT + int lda, + int* P, + int* info, + int batchSize, cudaStream_t stream); template <> -inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT - int n, float *const A[], // NOLINT - int lda, int *P, int *info, - int batchSize, cudaStream_t stream) { +inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT + int n, + float* const A[], // NOLINT + int lda, + int* P, + int* info, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize); } template <> -inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT - int n, double *const A[], // NOLINT - int lda, int *P, int *info, - int batchSize, cudaStream_t stream) { +inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT + int n, + double* const A[], // NOLINT + int lda, + int* P, + int* info, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize); } template -cublasStatus_t cublasgetriBatched(cublasHandle_t handle, int n, // NOLINT - const T *const A[], // NOLINT - int lda, const int *P, - T *const C[], // NOLINT - int ldc, int *info, int batchSize, +cublasStatus_t cublasgetriBatched(cublasHandle_t handle, + int n, // NOLINT + const T* const A[], // NOLINT + int lda, + const int* P, + T* const C[], // NOLINT + int ldc, + int* info, + int batchSize, cudaStream_t stream); template <> -inline cublasStatus_t cublasgetriBatched( // NOLINT - cublasHandle_t handle, int n, const float *const A[], // NOLINT - int lda, const int *P, float *const C[], // NOLINT - int ldc, int *info, int batchSize, cudaStream_t stream) { +inline cublasStatus_t cublasgetriBatched( // NOLINT + cublasHandle_t handle, + int n, + const float* const A[], // NOLINT + int lda, + const int* P, + float* const C[], // NOLINT + int ldc, + int* info, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); } template <> -inline cublasStatus_t cublasgetriBatched( // NOLINT - cublasHandle_t handle, int n, const double *const A[], // NOLINT - int lda, const int *P, double *const C[], // NOLINT - int ldc, int *info, int batchSize, cudaStream_t stream) { +inline cublasStatus_t cublasgetriBatched( // NOLINT + cublasHandle_t handle, + int n, + const double* const A[], // NOLINT + int lda, + const int* P, + double* const C[], // NOLINT + int ldc, + int* info, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); } @@ -416,34 +676,57 @@ inline cublasStatus_t cublasgetriBatched( // NOLINT template inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT - cublasOperation_t trans, int m, int n, - int nrhs, T *Aarray[], // NOLINT - int lda, T *Carray[], // NOLINT - int ldc, int *info, int *devInfoArray, - int batchSize, cudaStream_t stream); + cublasOperation_t trans, + int m, + int n, + int nrhs, + T* Aarray[], // NOLINT + int lda, + T* Carray[], // NOLINT + int ldc, + int* info, + int* devInfoArray, + int batchSize, + cudaStream_t stream); template <> inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT - cublasOperation_t trans, int m, int n, - int nrhs, float *Aarray[], // NOLINT - int lda, float *Carray[], // NOLINT - int ldc, int *info, int *devInfoArray, - int batchSize, cudaStream_t stream) { + cublasOperation_t trans, + int m, + int n, + int nrhs, + float* Aarray[], // NOLINT + int lda, + float* Carray[], // NOLINT + int ldc, + int* info, + int* devInfoArray, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, - info, devInfoArray, batchSize); + return cublasSgelsBatched( + handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); } template <> inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT - cublasOperation_t trans, int m, int n, - int nrhs, double *Aarray[], // NOLINT - int lda, double *Carray[], // NOLINT - int ldc, int *info, int *devInfoArray, - int batchSize, cudaStream_t stream) { + cublasOperation_t trans, + int m, + int n, + int nrhs, + double* Aarray[], // NOLINT + int lda, + double* Carray[], // NOLINT + int ldc, + int* info, + int* devInfoArray, + int batchSize, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, - info, devInfoArray, batchSize); + return cublasDgelsBatched( + handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); } /** @} */ @@ -453,33 +736,59 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT * @{ */ template -cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, const T *alfa, - const T *A, int lda, const T *beta, const T *B, - int ldb, T *C, int ldc, cudaStream_t stream); +cublasStatus_t cublasgeam(cublasHandle_t handle, + cublasOperation_t transA, + cublasOperation_t transB, + int m, + int n, + const T* alfa, + const T* A, + int lda, + const T* beta, + const T* B, + int ldb, + T* C, + int ldc, + cudaStream_t stream); template <> inline cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, - const float *alfa, const float *A, int lda, - const float *beta, const float *B, int ldb, - float *C, int ldc, cudaStream_t stream) { + cublasOperation_t transB, + int m, + int n, + const float* alfa, + const float* A, + int lda, + const float* beta, + const float* B, + int ldb, + float* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, - C, ldc); + return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc); } template <> inline cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA, - cublasOperation_t transB, int m, int n, - const double *alfa, const double *A, int lda, - const double *beta, const double *B, int ldb, - double *C, int ldc, cudaStream_t stream) { + cublasOperation_t transB, + int m, + int n, + const double* alfa, + const double* A, + int lda, + const double* beta, + const double* B, + int ldb, + double* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, - C, ldc); + return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc); } /** @} */ @@ -488,31 +797,59 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, int m, int n, const T *alpha, - const T *A, int lda, const T *B, int ldb, - const T *beta, T *C, int ldc, cudaStream_t stream); +cublasStatus_t cublassymm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const T* alpha, + const T* A, + int lda, + const T* B, + int ldb, + const T* beta, + T* C, + int ldc, + cudaStream_t stream); template <> -inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, int m, int n, - const float *alpha, const float *A, int lda, - const float *B, int ldb, const float *beta, - float *C, int ldc, cudaStream_t stream) { +inline cublasStatus_t cublassymm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const float* alpha, + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, + float* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, - ldc); + return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } template <> -inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, int m, int n, - const double *alpha, const double *A, int lda, - const double *B, int ldb, const double *beta, - double *C, int ldc, cudaStream_t stream) { +inline cublasStatus_t cublassymm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const double* alpha, + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, + double* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, - ldc); + return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } /** @} */ @@ -521,27 +858,51 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, * @{ */ template -cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, - cublasOperation_t trans, int n, int k, const T *alpha, - const T *A, int lda, const T *beta, T *C, int ldc, +cublasStatus_t cublassyrk(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const T* alpha, + const T* A, + int lda, + const T* beta, + T* C, + int ldc, cudaStream_t stream); template <> -inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, - cublasOperation_t trans, int n, int k, - const float *alpha, const float *A, int lda, - const float *beta, float *C, int ldc, - cudaStream_t stream) { +inline cublasStatus_t cublassyrk(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float* alpha, + const float* A, + int lda, + const float* beta, + float* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } template <> -inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, - cublasOperation_t trans, int n, int k, - const double *alpha, const double *A, int lda, - const double *beta, double *C, int ldc, - cudaStream_t stream) { +inline cublasStatus_t cublassyrk(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double* alpha, + const double* A, + int lda, + const double* beta, + double* C, + int ldc, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } @@ -552,52 +913,77 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, * @{ */ template -cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const T *x, int incx, - T *result, cudaStream_t stream); +cublasStatus_t cublasnrm2( + cublasHandle_t handle, int n, const T* x, int incx, T* result, cudaStream_t stream); template <> -inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const float *x, - int incx, float *result, cudaStream_t stream) { +inline cublasStatus_t cublasnrm2( + cublasHandle_t handle, int n, const float* x, int incx, float* result, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSnrm2(handle, n, x, incx, result); } template <> -inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const double *x, - int incx, double *result, - cudaStream_t stream) { +inline cublasStatus_t cublasnrm2( + cublasHandle_t handle, int n, const double* x, int incx, double* result, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDnrm2(handle, n, x, incx, result); } /** @} */ template -cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, cublasOperation_t trans, - cublasDiagType_t diag, int m, int n, const T *alpha, - const T *A, int lda, T *B, int ldb, +cublasStatus_t cublastrsm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const T* alpha, + const T* A, + int lda, + T* B, + int ldb, cudaStream_t stream); template <> -inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, cublasOperation_t trans, - cublasDiagType_t diag, int m, int n, - const float *alpha, const float *A, int lda, - float *B, int ldb, cudaStream_t stream) { +inline cublasStatus_t cublastrsm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const float* alpha, + const float* A, + int lda, + float* B, + int ldb, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, - ldb); + return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } template <> -inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, cublasOperation_t trans, - cublasDiagType_t diag, int m, int n, - const double *alpha, const double *A, int lda, - double *B, int ldb, cudaStream_t stream) { +inline cublasStatus_t cublastrsm(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const double* alpha, + const double* A, + int lda, + double* B, + int ldb, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); - return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, - ldb); + return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } /** @@ -605,21 +991,39 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, * @{ */ template -cublasStatus_t cublasdot(cublasHandle_t handle, int n, const T *x, int incx, - const T *y, int incy, T *result, cudaStream_t stream); +cublasStatus_t cublasdot(cublasHandle_t handle, + int n, + const T* x, + int incx, + const T* y, + int incy, + T* result, + cudaStream_t stream); template <> -inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const float *x, - int incx, const float *y, int incy, - float *result, cudaStream_t stream) { +inline cublasStatus_t cublasdot(cublasHandle_t handle, + int n, + const float* x, + int incx, + const float* y, + int incy, + float* result, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSdot(handle, n, x, incx, y, incy, result); } template <> -inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x, - int incx, const double *y, int incy, - double *result, cudaStream_t stream) { +inline cublasStatus_t cublasdot(cublasHandle_t handle, + int n, + const double* x, + int incx, + const double* y, + int incy, + double* result, + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDdot(handle, n, x, incx, y, incy, result); } @@ -639,7 +1043,8 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x, // template<> inline cublasStatus_t cublassetpointermode(cublasHandle_t handle, cublasPointerMode_t mode, - cudaStream_t stream) { + cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSetPointerMode(handle, mode); } @@ -650,21 +1055,21 @@ inline cublasStatus_t cublassetpointermode(cublasHandle_t handle, * @{ */ template -cublasStatus_t cublasscal(cublasHandle_t handle, int n, const T *alpha, T *x, - int incx, cudaStream_t stream); +cublasStatus_t cublasscal( + cublasHandle_t handle, int n, const T* alpha, T* x, int incx, cudaStream_t stream); template <> -inline cublasStatus_t cublasscal(cublasHandle_t handle, int n, - const float *alpha, float *x, int incx, - cudaStream_t stream) { +inline cublasStatus_t cublasscal( + cublasHandle_t handle, int n, const float* alpha, float* x, int incx, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasSscal(handle, n, alpha, x, incx); } template <> -inline cublasStatus_t cublasscal(cublasHandle_t handle, int n, - const double *alpha, double *x, int incx, - cudaStream_t stream) { +inline cublasStatus_t cublasscal( + cublasHandle_t handle, int n, const double* alpha, double* x, int incx, cudaStream_t stream) +{ CUBLAS_CHECK(cublasSetStream(handle, stream)); return cublasDscal(handle, n, alpha, x, incx); } diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/cusolver_wrappers.h index 6aa5e74455..85f2740647 100644 --- a/cpp/include/raft/linalg/cusolver_wrappers.h +++ b/cpp/include/raft/linalg/cusolver_wrappers.h @@ -24,8 +24,7 @@ #include #define _CUSOLVER_ERR_TO_STR(err) \ - case err: \ - return #err; + case err: return #err; namespace raft { @@ -33,16 +32,15 @@ namespace raft { * @brief Exception thrown when a cuSOLVER error is encountered. */ struct cusolver_error : public raft::exception { - explicit cusolver_error(char const *const message) - : raft::exception(message) {} - explicit cusolver_error(std::string const &message) - : raft::exception(message) {} + explicit cusolver_error(char const* const message) : raft::exception(message) {} + explicit cusolver_error(std::string const& message) : raft::exception(message) {} }; namespace linalg { namespace detail { -inline const char *cusolver_error_to_string(cusolverStatus_t err) { +inline const char* cusolver_error_to_string(cusolverStatus_t err) +{ switch (err) { _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); @@ -54,8 +52,7 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) { _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); - default: - return "CUSOLVER_STATUS_UNKNOWN"; + default: return "CUSOLVER_STATUS_UNKNOWN"; }; } @@ -76,8 +73,11 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) { cusolverStatus_t const status = (call); \ if (CUSOLVER_STATUS_SUCCESS != status) { \ std::string msg{}; \ - SET_ERROR_MSG(msg, "cuSOLVER error encountered at: ", \ - "call='%s', Reason=%d:%s", #call, status, \ + SET_ERROR_MSG(msg, \ + "cuSOLVER error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ raft::linalg::detail::cusolver_error_to_string(status)); \ throw raft::cusolver_error(msg); \ } \ @@ -107,42 +107,76 @@ namespace linalg { * @{ */ template -cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, int m, // NOLINT - int n, T *A, int lda, T *Workspace, - int *devIpiv, int *devInfo, +cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, + int m, // NOLINT + int n, + T* A, + int lda, + T* Workspace, + int* devIpiv, + int* devInfo, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT - int m, int n, float *A, int lda, - float *Workspace, int *devIpiv, - int *devInfo, cudaStream_t stream) { + int m, + int n, + float* A, + int lda, + float* Workspace, + int* devIpiv, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); } template <> inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT - int m, int n, double *A, int lda, - double *Workspace, int *devIpiv, - int *devInfo, cudaStream_t stream) { + int m, + int n, + double* A, + int lda, + double* Workspace, + int* devIpiv, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); } template cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork); + cusolverDnHandle_t handle, + int m, + int n, + T* A, + int lda, + int* Lwork); template <> inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) { + cusolverDnHandle_t handle, + int m, + int n, + float* A, + int lda, + int* Lwork) +{ return cusolverDnSgetrf_bufferSize(handle, m, n, A, lda, Lwork); } template <> inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) { + cusolverDnHandle_t handle, + int m, + int n, + double* A, + int lda, + int* Lwork) +{ return cusolverDnDgetrf_bufferSize(handle, m, n, A, lda, Lwork); } @@ -152,30 +186,49 @@ inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT - cublasOperation_t trans, int n, int nrhs, - const T *A, int lda, const int *devIpiv, T *B, - int ldb, int *devInfo, cudaStream_t stream); + cublasOperation_t trans, + int n, + int nrhs, + const T* A, + int lda, + const int* devIpiv, + T* B, + int ldb, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT - cublasOperation_t trans, int n, - int nrhs, const float *A, int lda, - const int *devIpiv, float *B, int ldb, - int *devInfo, cudaStream_t stream) { + cublasOperation_t trans, + int n, + int nrhs, + const float* A, + int lda, + const int* devIpiv, + float* B, + int ldb, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, - devInfo); + return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); } template <> inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT - cublasOperation_t trans, int n, - int nrhs, const double *A, int lda, - const int *devIpiv, double *B, int ldb, - int *devInfo, cudaStream_t stream) { + cublasOperation_t trans, + int n, + int nrhs, + const double* A, + int lda, + const int* devIpiv, + double* B, + int ldb, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, - devInfo); + return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); } /** @} */ @@ -185,20 +238,40 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT */ template cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const T *A, int lda, const T *W, int *lwork); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const T* A, + int lda, + const T* W, + int* lwork); template <> inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const float *A, int lda, const float *W, int *lwork) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const float* A, + int lda, + const float* W, + int* lwork) +{ return cusolverDnSsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork); } template <> inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const double *A, int lda, const double *W, int *lwork) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const double* A, + int lda, + const double* W, + int* lwork) +{ return cusolverDnDsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork); } /** @} */ @@ -209,52 +282,96 @@ inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnsyevj(cusolverDnHandle_t handle, // NOLINT - cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, T *A, int lda, T *W, T *work, int lwork, - int *info, syevjInfo_t params, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + T* A, + int lda, + T* W, + T* work, + int lwork, + int* info, + syevjInfo_t params, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnsyevj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, float *A, int lda, float *W, float *work, int lwork, int *info, - syevjInfo_t params, cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + float* A, + int lda, + float* W, + float* work, + int lwork, + int* info, + syevjInfo_t params, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, - params); + return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); } template <> inline cusolverStatus_t cusolverDnsyevj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, double *A, int lda, double *W, double *work, int lwork, int *info, - syevjInfo_t params, cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + double* A, + int lda, + double* W, + double* work, + int lwork, + int* info, + syevjInfo_t params, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, - params); + return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); } template cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const T *A, int lda, const T *W, int *lwork, syevjInfo_t params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const T* A, + int lda, + const T* W, + int* lwork, + syevjInfo_t params); template <> inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const float *A, int lda, const float *W, int *lwork, - syevjInfo_t params) { - return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, - params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const float* A, + int lda, + const float* W, + int* lwork, + syevjInfo_t params) +{ + return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params); } template <> inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, const double *A, int lda, const double *W, int *lwork, - syevjInfo_t params) { - return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, - params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const double* A, + int lda, + const double* W, + int* lwork, + syevjInfo_t params) +{ + return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params); } /** @} */ @@ -264,32 +381,49 @@ inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT - cusolverEigMode_t jobz, cublasFillMode_t uplo, - int n, T *A, int lda, T *W, T *work, int lwork, - int *devInfo, cudaStream_t stream); + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + T* A, + int lda, + T* W, + T* work, + int lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT cusolverEigMode_t jobz, - cublasFillMode_t uplo, int n, float *A, - int lda, float *W, float *work, - int lwork, int *devInfo, - cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + float* A, + int lda, + float* W, + float* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, - devInfo); + return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT cusolverEigMode_t jobz, - cublasFillMode_t uplo, int n, double *A, - int lda, double *W, double *work, - int lwork, int *devInfo, - cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + double* A, + int lda, + double* W, + double* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, - devInfo); + return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo); } /** @} */ @@ -297,57 +431,134 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT /** * @defgroup syevdx cusolver syevdx operations * @{ -*/ + */ template cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, const T *A, int lda, T vl, T vu, int il, int iu, - int *h_meig, const T *W, int *lwork); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const T* A, + int lda, + T vl, + T vu, + int il, + int iu, + int* h_meig, + const T* W, + int* lwork); template <> inline cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, const float *A, int lda, float vl, float vu, - int il, int iu, int *h_meig, const float *W, int *lwork) { - return cusolverDnSsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl, - vu, il, iu, h_meig, W, lwork); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const float* A, + int lda, + float vl, + float vu, + int il, + int iu, + int* h_meig, + const float* W, + int* lwork) +{ + return cusolverDnSsyevdx_bufferSize( + handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork); } template <> inline cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, const double *A, int lda, double vl, double vu, - int il, int iu, int *h_meig, const double *W, int *lwork) { - return cusolverDnDsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl, - vu, il, iu, h_meig, W, lwork); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const double* A, + int lda, + double vl, + double vu, + int il, + int iu, + int* h_meig, + const double* W, + int* lwork) +{ + return cusolverDnDsyevdx_bufferSize( + handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork); } template cusolverStatus_t cusolverDnsyevdx( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, T *A, int lda, T vl, T vu, int il, int iu, - int *h_meig, T *W, T *work, int lwork, int *devInfo, cudaStream_t stream); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + T* A, + int lda, + T vl, + T vu, + int il, + int iu, + int* h_meig, + T* W, + T* work, + int lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnsyevdx( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, float *A, int lda, float vl, float vu, int il, - int iu, int *h_meig, float *W, float *work, int lwork, int *devInfo, - cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + float* A, + int lda, + float vl, + float vu, + int il, + int iu, + int* h_meig, + float* W, + float* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, - h_meig, W, work, lwork, devInfo); + return cusolverDnSsyevdx( + handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnsyevdx( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, - cublasFillMode_t uplo, int n, double *A, int lda, double vl, double vu, - int il, int iu, int *h_meig, double *W, double *work, int lwork, int *devInfo, - cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + double* A, + int lda, + double vl, + double vu, + int il, + int iu, + int* h_meig, + double* W, + double* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, - h_meig, W, work, lwork, devInfo); + return cusolverDnDsyevdx( + handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo); } /** @} */ #endif @@ -358,7 +569,11 @@ inline cusolverStatus_t cusolverDnsyevdx( // NOLINT */ template cusolverStatus_t cusolverDngesvd_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, int *lwork) { + cusolverDnHandle_t handle, + int m, + int n, + int* lwork) +{ if (std::is_same, float>::value) { return cusolverDnSgesvd_bufferSize(handle, m, n, lwork); } else { @@ -367,72 +582,194 @@ cusolverStatus_t cusolverDngesvd_bufferSize( // NOLINT } template cusolverStatus_t cusolverDngesvd( // NOLINT - cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, - T *A, int lda, T *S, T *U, int ldu, T *VT, int ldvt, T *work, int lwork, - T *rwork, int *devInfo, cudaStream_t stream); + cusolverDnHandle_t handle, + signed char jobu, + signed char jobvt, + int m, + int n, + T* A, + int lda, + T* S, + T* U, + int ldu, + T* VT, + int ldvt, + T* work, + int lwork, + T* rwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngesvd( // NOLINT - cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, - float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt, - float *work, int lwork, float *rwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + signed char jobu, + signed char jobvt, + int m, + int n, + float* A, + int lda, + float* S, + float* U, + int ldu, + float* VT, + int ldvt, + float* work, + int lwork, + float* rwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, - ldvt, work, lwork, rwork, devInfo); + return cusolverDnSgesvd( + handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo); } template <> inline cusolverStatus_t cusolverDngesvd( // NOLINT - cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, - double *A, int lda, double *S, double *U, int ldu, double *VT, int ldvt, - double *work, int lwork, double *rwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + signed char jobu, + signed char jobvt, + int m, + int n, + double* A, + int lda, + double* S, + double* U, + int ldu, + double* VT, + int ldvt, + double* work, + int lwork, + double* rwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, - ldvt, work, lwork, rwork, devInfo); + return cusolverDnDgesvd( + handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo); } template inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - const T *A, int lda, const T *S, const T *U, int ldu, const T *V, int ldv, - int *lwork, gesvdjInfo_t params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + const T* A, + int lda, + const T* S, + const T* U, + int ldu, + const T* V, + int ldv, + int* lwork, + gesvdjInfo_t params); template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - const float *A, int lda, const float *S, const float *U, int ldu, - const float *V, int ldv, int *lwork, gesvdjInfo_t params) { - return cusolverDnSgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U, - ldu, V, ldv, lwork, params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + const float* A, + int lda, + const float* S, + const float* U, + int ldu, + const float* V, + int ldv, + int* lwork, + gesvdjInfo_t params) +{ + return cusolverDnSgesvdj_bufferSize( + handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params); } template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - const double *A, int lda, const double *S, const double *U, int ldu, - const double *V, int ldv, int *lwork, gesvdjInfo_t params) { - return cusolverDnDgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U, - ldu, V, ldv, lwork, params); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + const double* A, + int lda, + const double* S, + const double* U, + int ldu, + const double* V, + int ldv, + int* lwork, + gesvdjInfo_t params) +{ + return cusolverDnDgesvdj_bufferSize( + handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params); } template inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - T *A, int lda, T *S, T *U, int ldu, T *V, int ldv, T *work, int lwork, - int *info, gesvdjInfo_t params, cudaStream_t stream); + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + T* A, + int lda, + T* S, + T* U, + int ldu, + T* V, + int ldv, + T* work, + int lwork, + int* info, + gesvdjInfo_t params, + cudaStream_t stream); template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - float *A, int lda, float *S, float *U, int ldu, float *V, int ldv, - float *work, int lwork, int *info, gesvdjInfo_t params, cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + float* A, + int lda, + float* S, + float* U, + int ldu, + float* V, + int ldv, + float* work, + int lwork, + int* info, + gesvdjInfo_t params, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, - work, lwork, info, params); + return cusolverDnSgesvdj( + handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); } template <> inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT - cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, - double *A, int lda, double *S, double *U, int ldu, double *V, int ldv, - double *work, int lwork, int *info, gesvdjInfo_t params, - cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + double* A, + int lda, + double* S, + double* U, + int ldu, + double* V, + int ldv, + double* work, + int lwork, + int* info, + gesvdjInfo_t params, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, - work, lwork, info, params); + return cusolverDnDgesvdj( + handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); } /** @} */ @@ -442,43 +779,74 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT */ template cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, T *A, int lda, - int *Lwork); + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + T* A, + int lda, + int* Lwork); template <> inline cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, float *A, int lda, - int *Lwork) { + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + float* A, + int lda, + int* Lwork) +{ return cusolverDnSpotrf_bufferSize(handle, uplo, n, A, lda, Lwork); } template <> inline cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda, - int *Lwork) { + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + double* A, + int lda, + int* Lwork) +{ return cusolverDnDpotrf_bufferSize(handle, uplo, n, A, lda, Lwork); } template inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, T *A, - int lda, T *Workspace, int Lwork, - int *devInfo, cudaStream_t stream); + cublasFillMode_t uplo, + int n, + T* A, + int lda, + T* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, float *A, - int lda, float *Workspace, int Lwork, - int *devInfo, cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + float* A, + int lda, + float* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, double *A, - int lda, double *Workspace, int Lwork, - int *devInfo, cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + double* A, + int lda, + double* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); } @@ -490,26 +858,44 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT */ template cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, int nrhs, - const T *A, int lda, T *B, int ldb, - int *devInfo, cudaStream_t stream); + cublasFillMode_t uplo, + int n, + int nrhs, + const T* A, + int lda, + T* B, + int ldb, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, int nrhs, - const float *A, int lda, float *B, - int ldb, int *devInfo, - cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + int nrhs, + const float* A, + int lda, + float* B, + int ldb, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); } template <> inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT - cublasFillMode_t uplo, int n, int nrhs, - const double *A, int lda, double *B, - int ldb, int *devInfo, - cudaStream_t stream) { + cublasFillMode_t uplo, + int n, + int nrhs, + const double* A, + int lda, + double* B, + int ldb, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); } @@ -520,38 +906,75 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT * @{ */ template -cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, int m, // NOLINT - int n, T *A, int lda, T *TAU, T *Workspace, - int Lwork, int *devInfo, cudaStream_t stream); +cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, + int m, // NOLINT + int n, + T* A, + int lda, + T* TAU, + T* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT - int m, int n, float *A, int lda, - float *TAU, float *Workspace, int Lwork, - int *devInfo, cudaStream_t stream) { + int m, + int n, + float* A, + int lda, + float* TAU, + float* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); } template <> inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT - int m, int n, double *A, int lda, - double *TAU, double *Workspace, - int Lwork, int *devInfo, - cudaStream_t stream) { + int m, + int n, + double* A, + int lda, + double* TAU, + double* Workspace, + int Lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); } template cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork); + cusolverDnHandle_t handle, + int m, + int n, + T* A, + int lda, + int* Lwork); template <> inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) { + cusolverDnHandle_t handle, + int m, + int n, + float* A, + int lda, + int* Lwork) +{ return cusolverDnSgeqrf_bufferSize(handle, m, n, A, lda, Lwork); } template <> inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) { + cusolverDnHandle_t handle, + int m, + int n, + double* A, + int lda, + int* Lwork) +{ return cusolverDnDgeqrf_bufferSize(handle, m, n, A, lda, Lwork); } /** @} */ @@ -562,38 +985,86 @@ inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnorgqr( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, T *A, int lda, const T *tau, - T *work, int lwork, int *devInfo, cudaStream_t stream); + cusolverDnHandle_t handle, + int m, + int n, + int k, + T* A, + int lda, + const T* tau, + T* work, + int lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnorgqr( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, float *A, int lda, - const float *tau, float *work, int lwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + int m, + int n, + int k, + float* A, + int lda, + const float* tau, + float* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnSorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnorgqr( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, double *A, int lda, - const double *tau, double *work, int lwork, int *devInfo, - cudaStream_t stream) { + cusolverDnHandle_t handle, + int m, + int n, + int k, + double* A, + int lda, + const double* tau, + double* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); return cusolverDnDorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); } template cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, const T *A, int lda, - const T *TAU, int *lwork); + cusolverDnHandle_t handle, + int m, + int n, + int k, + const T* A, + int lda, + const T* TAU, + int* lwork); template <> inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, const float *A, int lda, - const float *TAU, int *lwork) { + cusolverDnHandle_t handle, + int m, + int n, + int k, + const float* A, + int lda, + const float* TAU, + int* lwork) +{ return cusolverDnSorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork); } template <> inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, int m, int n, int k, const double *A, int lda, - const double *TAU, int *lwork) { + cusolverDnHandle_t handle, + int m, + int n, + int k, + const double* A, + int lda, + const double* TAU, + int* lwork) +{ return cusolverDnDorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork); } /** @} */ @@ -604,53 +1075,114 @@ inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT */ template cusolverStatus_t cusolverDnormqr(cusolverDnHandle_t handle, // NOLINT - cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const T *A, int lda, - const T *tau, T *C, int ldc, T *work, - int lwork, int *devInfo, cudaStream_t stream); + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const T* A, + int lda, + const T* tau, + T* C, + int ldc, + T* work, + int lwork, + int* devInfo, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnormqr( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const float *A, int lda, const float *tau, float *C, - int ldc, float *work, int lwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const float* A, + int lda, + const float* tau, + float* C, + int ldc, + float* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, - work, lwork, devInfo); + return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); } template <> inline cusolverStatus_t cusolverDnormqr( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const double *A, int lda, const double *tau, double *C, - int ldc, double *work, int lwork, int *devInfo, cudaStream_t stream) { + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const double* A, + int lda, + const double* tau, + double* C, + int ldc, + double* work, + int lwork, + int* devInfo, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, - work, lwork, devInfo); + return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); } template cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const T *A, int lda, const T *tau, const T *C, int ldc, - int *lwork); + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const T* A, + int lda, + const T* tau, + const T* C, + int ldc, + int* lwork); template <> inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const float *A, int lda, const float *tau, - const float *C, int ldc, int *lwork) { - return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, - C, ldc, lwork); + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const float* A, + int lda, + const float* tau, + const float* C, + int ldc, + int* lwork) +{ + return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); } template <> inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT - cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, - int m, int n, int k, const double *A, int lda, const double *tau, - const double *C, int ldc, int *lwork) { - return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, - C, ldc, lwork); + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const double* A, + int lda, + const double* tau, + const double* C, + int ldc, + int* lwork) +{ + return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); } /** @} */ @@ -660,62 +1192,136 @@ inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT */ template cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA, - const int *csrColIndA, int batchSize, csrqrInfo_t info, - size_t *internalDataInBytes, size_t *workspaceInBytes); + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + int batchSize, + csrqrInfo_t info, + size_t* internalDataInBytes, + size_t* workspaceInBytes); template <> inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA, - const int *csrColIndA, int batchSize, csrqrInfo_t info, - size_t *internalDataInBytes, size_t *workspaceInBytes) { - return cusolverSpScsrqrBufferInfoBatched( - handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize, - info, internalDataInBytes, workspaceInBytes); + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + int batchSize, + csrqrInfo_t info, + size_t* internalDataInBytes, + size_t* workspaceInBytes) +{ + return cusolverSpScsrqrBufferInfoBatched(handle, + m, + n, + nnzA, + descrA, + csrValA, + csrRowPtrA, + csrColIndA, + batchSize, + info, + internalDataInBytes, + workspaceInBytes); } template <> inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA, - const int *csrColIndA, int batchSize, csrqrInfo_t info, - size_t *internalDataInBytes, size_t *workspaceInBytes) { - return cusolverSpDcsrqrBufferInfoBatched( - handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize, - info, internalDataInBytes, workspaceInBytes); + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + int batchSize, + csrqrInfo_t info, + size_t* internalDataInBytes, + size_t* workspaceInBytes) +{ + return cusolverSpDcsrqrBufferInfoBatched(handle, + m, + n, + nnzA, + descrA, + csrValA, + csrRowPtrA, + csrColIndA, + batchSize, + info, + internalDataInBytes, + workspaceInBytes); } template cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA, - const int *csrColIndA, const T *b, T *x, int batchSize, csrqrInfo_t info, - void *pBuffer, cudaStream_t stream); + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const T* b, + T* x, + int batchSize, + csrqrInfo_t info, + void* pBuffer, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA, - const int *csrColIndA, const float *b, float *x, int batchSize, - csrqrInfo_t info, void *pBuffer, cudaStream_t stream) { + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const float* b, + float* x, + int batchSize, + csrqrInfo_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); - return cusolverSpScsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA, - csrRowPtrA, csrColIndA, b, x, batchSize, - info, pBuffer); + return cusolverSpScsrqrsvBatched( + handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); } template <> inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT - cusolverSpHandle_t handle, int m, int n, int nnzA, - const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA, - const int *csrColIndA, const double *b, double *x, int batchSize, - csrqrInfo_t info, void *pBuffer, cudaStream_t stream) { + cusolverSpHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const double* b, + double* x, + int batchSize, + csrqrInfo_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); - return cusolverSpDcsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA, - csrRowPtrA, csrColIndA, b, x, batchSize, - info, pBuffer); + return cusolverSpDcsrqrsvBatched( + handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer); } /** @} */ @@ -726,66 +1332,165 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT */ template cusolverStatus_t cusolverDnxsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, - cublasFillMode_t uplo, int64_t n, const T *A, int64_t lda, const T *W, - size_t *workspaceInBytesOnDevice, size_t *workspaceInBytesOnHost, + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int64_t n, + const T* A, + int64_t lda, + const T* W, + size_t* workspaceInBytesOnDevice, + size_t* workspaceInBytesOnHost, cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnxsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, - cublasFillMode_t uplo, int64_t n, const float *A, int64_t lda, const float *W, - size_t *workspaceInBytesOnDevice, size_t *workspaceInBytesOnHost, - cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int64_t n, + const float* A, + int64_t lda, + const float* W, + size_t* workspaceInBytesOnDevice, + size_t* workspaceInBytesOnHost, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnXsyevd_bufferSize( - handle, params, jobz, uplo, n, CUDA_R_32F, A, lda, CUDA_R_32F, W, - CUDA_R_32F, workspaceInBytesOnDevice, workspaceInBytesOnHost); + return cusolverDnXsyevd_bufferSize(handle, + params, + jobz, + uplo, + n, + CUDA_R_32F, + A, + lda, + CUDA_R_32F, + W, + CUDA_R_32F, + workspaceInBytesOnDevice, + workspaceInBytesOnHost); } template <> inline cusolverStatus_t cusolverDnxsyevd_bufferSize( // NOLINT - cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, - cublasFillMode_t uplo, int64_t n, const double *A, int64_t lda, - const double *W, size_t *workspaceInBytesOnDevice, - size_t *workspaceInBytesOnHost, cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int64_t n, + const double* A, + int64_t lda, + const double* W, + size_t* workspaceInBytesOnDevice, + size_t* workspaceInBytesOnHost, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnXsyevd_bufferSize( - handle, params, jobz, uplo, n, CUDA_R_64F, A, lda, CUDA_R_64F, W, - CUDA_R_64F, workspaceInBytesOnDevice, workspaceInBytesOnHost); + return cusolverDnXsyevd_bufferSize(handle, + params, + jobz, + uplo, + n, + CUDA_R_64F, + A, + lda, + CUDA_R_64F, + W, + CUDA_R_64F, + workspaceInBytesOnDevice, + workspaceInBytesOnHost); } template cusolverStatus_t cusolverDnxsyevd( // NOLINT - cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, - cublasFillMode_t uplo, int64_t n, T *A, int64_t lda, T *W, T *bufferOnDevice, - size_t workspaceInBytesOnDevice, T *bufferOnHost, - size_t workspaceInBytesOnHost, int *info, cudaStream_t stream); + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int64_t n, + T* A, + int64_t lda, + T* W, + T* bufferOnDevice, + size_t workspaceInBytesOnDevice, + T* bufferOnHost, + size_t workspaceInBytesOnHost, + int* info, + cudaStream_t stream); template <> inline cusolverStatus_t cusolverDnxsyevd( // NOLINT - cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, - cublasFillMode_t uplo, int64_t n, float *A, int64_t lda, float *W, - float *bufferOnDevice, size_t workspaceInBytesOnDevice, float *bufferOnHost, - size_t workspaceInBytesOnHost, int *info, cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int64_t n, + float* A, + int64_t lda, + float* W, + float* bufferOnDevice, + size_t workspaceInBytesOnDevice, + float* bufferOnHost, + size_t workspaceInBytesOnHost, + int* info, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnXsyevd(handle, params, jobz, uplo, n, CUDA_R_32F, A, lda, - CUDA_R_32F, W, CUDA_R_32F, bufferOnDevice, - workspaceInBytesOnDevice, bufferOnHost, - workspaceInBytesOnHost, info); + return cusolverDnXsyevd(handle, + params, + jobz, + uplo, + n, + CUDA_R_32F, + A, + lda, + CUDA_R_32F, + W, + CUDA_R_32F, + bufferOnDevice, + workspaceInBytesOnDevice, + bufferOnHost, + workspaceInBytesOnHost, + info); } template <> inline cusolverStatus_t cusolverDnxsyevd( // NOLINT - cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, - cublasFillMode_t uplo, int64_t n, double *A, int64_t lda, double *W, - double *bufferOnDevice, size_t workspaceInBytesOnDevice, double *bufferOnHost, - size_t workspaceInBytesOnHost, int *info, cudaStream_t stream) { + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int64_t n, + double* A, + int64_t lda, + double* W, + double* bufferOnDevice, + size_t workspaceInBytesOnDevice, + double* bufferOnHost, + size_t workspaceInBytesOnHost, + int* info, + cudaStream_t stream) +{ CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); - return cusolverDnXsyevd(handle, params, jobz, uplo, n, CUDA_R_64F, A, lda, - CUDA_R_64F, W, CUDA_R_64F, bufferOnDevice, - workspaceInBytesOnDevice, bufferOnHost, - workspaceInBytesOnHost, info); + return cusolverDnXsyevd(handle, + params, + jobz, + uplo, + n, + CUDA_R_64F, + A, + lda, + CUDA_R_64F, + W, + CUDA_R_64F, + bufferOnDevice, + workspaceInBytesOnDevice, + bufferOnHost, + workspaceInBytesOnHost, + info); } /** @} */ #endif diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh index c848ac1f4b..562a3d8991 100644 --- a/cpp/include/raft/linalg/divide.cuh +++ b/cpp/include/raft/linalg/divide.cuh @@ -33,11 +33,10 @@ namespace linalg { * @{ */ template -void divideScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, - cudaStream_t stream) { +void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +{ unaryOp( - out, in, len, [scalar] __device__(math_t in) { return in / scalar; }, - stream); + out, in, len, [scalar] __device__(math_t in) { return in / scalar; }, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh index e141883b6c..288d379dac 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/eig.cuh @@ -29,25 +29,42 @@ namespace raft { namespace linalg { template -void eigDC_legacy(const raft::handle_t &handle, const math_t *in, - std::size_t n_rows, std::size_t n_cols, math_t *eig_vectors, - math_t *eig_vals, cudaStream_t stream) { +void eigDC_legacy(const raft::handle_t& handle, + const math_t* in, + std::size_t n_rows, + std::size_t n_cols, + math_t* eig_vectors, + math_t* eig_vals, + cudaStream_t stream) +{ cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int lwork; - CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, n_rows, in, - n_cols, eig_vals, &lwork)); + CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + in, + n_cols, + eig_vals, + &lwork)); rmm::device_uvector d_work(lwork, stream); rmm::device_scalar d_dev_info(stream); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, - n_cols, eig_vals, d_work.data(), lwork, - d_dev_info.data(), stream)); + CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + eig_vals, + d_work.data(), + lwork, + d_dev_info.data(), + stream)); CUDA_CHECK(cudaGetLastError()); auto dev_info = d_dev_info.value(stream); @@ -70,9 +87,14 @@ void eigDC_legacy(const raft::handle_t &handle, const math_t *in, * @{ */ template -void eigDC(const raft::handle_t &handle, const math_t *in, std::size_t n_rows, - std::size_t n_cols, math_t *eig_vectors, math_t *eig_vals, - cudaStream_t stream) { +void eigDC(const raft::handle_t& handle, + const math_t* in, + std::size_t n_rows, + std::size_t n_cols, + math_t* eig_vectors, + math_t* eig_vals, + cudaStream_t stream) +{ #if CUDART_VERSION < 11010 eigDC_legacy(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream); #else @@ -82,11 +104,18 @@ void eigDC(const raft::handle_t &handle, const math_t *in, std::size_t n_rows, CUSOLVER_CHECK(cusolverDnCreateParams(&dn_params)); size_t workspaceDevice = 0; - size_t workspaceHost = 0; - CUSOLVER_CHECK(cusolverDnxsyevd_bufferSize( - cusolverH, dn_params, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, - static_cast(n_rows), eig_vectors, static_cast(n_cols), - eig_vals, &workspaceDevice, &workspaceHost, stream)); + size_t workspaceHost = 0; + CUSOLVER_CHECK(cusolverDnxsyevd_bufferSize(cusolverH, + dn_params, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + static_cast(n_rows), + eig_vectors, + static_cast(n_cols), + eig_vals, + &workspaceDevice, + &workspaceHost, + stream)); rmm::device_uvector d_work(workspaceDevice / sizeof(math_t), stream); rmm::device_scalar d_dev_info(stream); @@ -94,11 +123,20 @@ void eigDC(const raft::handle_t &handle, const math_t *in, std::size_t n_rows, raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnxsyevd( - cusolverH, dn_params, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, - static_cast(n_rows), eig_vectors, static_cast(n_cols), - eig_vals, d_work.data(), workspaceDevice, h_work.data(), workspaceHost, - d_dev_info.data(), stream)); + CUSOLVER_CHECK(cusolverDnxsyevd(cusolverH, + dn_params, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + static_cast(n_rows), + eig_vectors, + static_cast(n_cols), + eig_vals, + d_work.data(), + workspaceDevice, + h_work.data(), + workspaceHost, + d_dev_info.data(), + stream)); CUDA_CHECK(cudaGetLastError()); CUSOLVER_CHECK(cusolverDnDestroyParams(dn_params)); @@ -128,38 +166,79 @@ enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT }; * @{ */ template -void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, - int n_eig_vals, math_t *eig_vectors, math_t *eig_vals, - EigVecMemUsage memUsage, cudaStream_t stream) { +void eigSelDC(const raft::handle_t& handle, + math_t* in, + int n_rows, + int n_cols, + int n_eig_vals, + math_t* eig_vectors, + math_t* eig_vals, + EigVecMemUsage memUsage, + cudaStream_t stream) +{ cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int lwork; int h_meig; - CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), - n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork)); + CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, + n_rows, + in, + n_cols, + math_t(0.0), + math_t(0.0), + n_cols - n_eig_vals + 1, + n_cols, + &h_meig, + eig_vals, + &lwork)); rmm::device_uvector d_work(lwork, stream); rmm::device_scalar d_dev_info(stream); rmm::device_uvector d_eig_vectors(0, stream); if (memUsage == OVERWRITE_INPUT) { - CUSOLVER_CHECK(cusolverDnsyevdx( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), - n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, d_work.data(), lwork, - d_dev_info.data(), stream)); + CUSOLVER_CHECK(cusolverDnsyevdx(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, + n_rows, + in, + n_cols, + math_t(0.0), + math_t(0.0), + n_cols - n_eig_vals + 1, + n_cols, + &h_meig, + eig_vals, + d_work.data(), + lwork, + d_dev_info.data(), + stream)); } else if (memUsage == COPY_INPUT) { d_eig_vectors.resize(n_rows * n_cols, stream); raft::matrix::copy(in, d_eig_vectors.data(), n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnsyevdx( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, - CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, n_cols, math_t(0.0), - math_t(0.0), n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, - d_work.data(), lwork, d_dev_info.data(), stream)); + CUSOLVER_CHECK(cusolverDnsyevdx(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUSOLVER_EIG_RANGE_I, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + math_t(0.0), + math_t(0.0), + n_cols - n_eig_vals + 1, + n_cols, + &h_meig, + eig_vals, + d_work.data(), + lwork, + d_dev_info.data(), + stream)); } CUDA_CHECK(cudaGetLastError()); @@ -170,11 +249,10 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, "This usually occurs when some of the features do not vary enough."); if (memUsage == OVERWRITE_INPUT) { - raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals, - stream); + raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals, stream); } else if (memUsage == COPY_INPUT) { - raft::matrix::truncZeroOrigin(d_eig_vectors.data(), n_rows, eig_vectors, - n_rows, n_eig_vals, stream); + raft::matrix::truncZeroOrigin( + d_eig_vectors.data(), n_rows, eig_vectors, n_rows, n_eig_vals, stream); } } @@ -195,36 +273,54 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, * @{ */ template -void eigJacobi(const raft::handle_t &handle, const math_t *in, - std::size_t n_rows, std::size_t n_cols, math_t *eig_vectors, - math_t *eig_vals, cudaStream_t stream, math_t tol = 1.e-7, - std::uint32_t sweeps = 15) { +void eigJacobi(const raft::handle_t& handle, + const math_t* in, + std::size_t n_rows, + std::size_t n_cols, + math_t* eig_vectors, + math_t* eig_vals, + cudaStream_t stream, + math_t tol = 1.e-7, + std::uint32_t sweeps = 15) +{ cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); syevjInfo_t syevj_params = nullptr; CUSOLVER_CHECK(cusolverDnCreateSyevjInfo(&syevj_params)); CUSOLVER_CHECK(cusolverDnXsyevjSetTolerance(syevj_params, tol)); - CUSOLVER_CHECK( - cusolverDnXsyevjSetMaxSweeps(syevj_params, static_cast(sweeps))); + CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(syevj_params, static_cast(sweeps))); int lwork; - CUSOLVER_CHECK(cusolverDnsyevj_bufferSize( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows, - eig_vectors, n_cols, eig_vals, &lwork, syevj_params)); + CUSOLVER_CHECK(cusolverDnsyevj_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + eig_vals, + &lwork, + syevj_params)); rmm::device_uvector d_work(lwork, stream); rmm::device_scalar dev_info(stream); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); - CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, CUSOLVER_EIG_MODE_VECTOR, - CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, - n_cols, eig_vals, d_work.data(), lwork, - dev_info.data(), syevj_params, stream)); + CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + CUBLAS_FILL_MODE_UPPER, + n_rows, + eig_vectors, + n_cols, + eig_vals, + d_work.data(), + lwork, + dev_info.data(), + syevj_params, + stream)); int executed_sweeps; - CUSOLVER_CHECK( - cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps)); + CUSOLVER_CHECK(cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps)); CUDA_CHECK(cudaGetLastError()); CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params)); diff --git a/cpp/include/raft/linalg/eltwise.cuh b/cpp/include/raft/linalg/eltwise.cuh index 1c6dee562d..097c3ac218 100644 --- a/cpp/include/raft/linalg/eltwise.cuh +++ b/cpp/include/raft/linalg/eltwise.cuh @@ -34,19 +34,17 @@ namespace linalg { * @{ */ template -void scalarAdd(OutType *out, const InType *in, InType scalar, IdxType len, - cudaStream_t stream) { +void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ raft::linalg::unaryOp( - out, in, len, [scalar] __device__(InType in) { return in + scalar; }, - stream); + out, in, len, [scalar] __device__(InType in) { return in + scalar; }, stream); } template -void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len, - cudaStream_t stream) { +void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ raft::linalg::unaryOp( - out, in, len, [scalar] __device__(InType in) { return in * scalar; }, - stream); + out, in, len, [scalar] __device__(InType in) { return in * scalar; }, stream); } /** @} */ @@ -62,42 +60,46 @@ void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len, * @{ */ template -void eltwiseAdd(OutType *out, const InType *in1, const InType *in2, IdxType len, - cudaStream_t stream) { +void eltwiseAdd( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, stream); } template -void eltwiseSub(OutType *out, const InType *in1, const InType *in2, IdxType len, - cudaStream_t stream) { +void eltwiseSub( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; }, stream); } template -void eltwiseMultiply(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void eltwiseMultiply( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; }, stream); } template -void eltwiseDivide(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void eltwiseDivide( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; }, stream); } template -void eltwiseDivideCheckZero(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void eltwiseDivideCheckZero( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, + out, + in1, + in2, + len, [] __device__(InType a, InType b) { if (b == InType(0.0)) return InType(0.0); diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh index 0a4897cc0b..d5942b7446 100644 --- a/cpp/include/raft/linalg/gemm.cuh +++ b/cpp/include/raft/linalg/gemm.cuh @@ -43,35 +43,53 @@ namespace linalg { * @param stream cuda stream */ template -void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, - int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c, - cublasOperation_t trans_a, cublasOperation_t trans_b, math_t alpha, - math_t beta, cudaStream_t stream) { +void gemm(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* b, + math_t* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + math_t alpha, + math_t beta, + cudaStream_t stream) +{ cublasHandle_t cublas_h = handle.get_cublas_handle(); - int m = n_rows_c; - int n = n_cols_c; - int k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a; + int m = n_rows_c; + int n = n_cols_c; + int k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a; int lda = trans_a == CUBLAS_OP_T ? k : m; int ldb = trans_b == CUBLAS_OP_T ? n : k; int ldc = m; - CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, - b, ldb, &beta, c, ldc, stream)); + CUBLAS_CHECK( + cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); } template -void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, - int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c, - cublasOperation_t trans_a, cublasOperation_t trans_b, - cudaStream_t stream) { +void gemm(const raft::handle_t& handle, + const math_t* a, + int n_rows_a, + int n_cols_a, + const math_t* b, + math_t* c, + int n_rows_c, + int n_cols_c, + cublasOperation_t trans_a, + cublasOperation_t trans_b, + cudaStream_t stream) +{ math_t alpha = math_t(1); - math_t beta = math_t(0); - gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, - trans_b, alpha, beta, stream); + math_t beta = math_t(0); + gemm( + handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream); } /** - * @brief A wrapper for CUBLS GEMM function designed for handling all possible + * @brief A wrapper for CUBLS GEMM function designed for handling all possible * combinations of operand layouts. * It computes the following equation: Z = alpha . X * Y + beta . Z * @tparam T Data type of input/output matrices (float/double) @@ -90,9 +108,20 @@ void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a, * @param beta scalar */ template -void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, - int _K, bool isZColMajor, bool isXColMajor, bool isYColMajor, - cudaStream_t stream, T alpha = T(1.0), T beta = T(0.0)) { +void gemm(const raft::handle_t& handle, + T* z, + T* x, + T* y, + int _M, + int _N, + int _K, + bool isZColMajor, + bool isXColMajor, + bool isYColMajor, + cudaStream_t stream, + T alpha = T(1.0), + T beta = T(0.0)) +{ cublasHandle_t cublas_h = handle.get_cublas_handle(); cublasOperation_t trans_a, trans_b; @@ -119,13 +148,13 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, // therefore trans_x needs to be CUBLAS_OP_T. If x is in column major // layout, trans_b needs to be CUBLAS_OP_N. trans_b = isYColMajor == true ? CUBLAS_OP_N : CUBLAS_OP_T; - ldb = isYColMajor == true ? _K : _N; + ldb = isYColMajor == true ? _K : _N; - c = z; + c = z; ldc = _M; - M = _M; - N = _N; - K = _K; + M = _M; + N = _N; + K = _K; } else { // Result c is required in row major layout Thus we pick // a = y, b = x and c = a * b = y * x @@ -154,7 +183,7 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, // Set leading dimension appropriately ldb = isXColMajor == true ? _M : _K; - c = z; + c = z; ldc = _N; M = _N; @@ -162,8 +191,8 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N, K = _K; } // Actual cuBLAS call - CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, - b, ldb, &beta, c, ldc, stream)); + CUBLAS_CHECK( + cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, b, ldb, &beta, c, ldc, stream)); } } // end namespace linalg diff --git a/cpp/include/raft/linalg/gemv.h b/cpp/include/raft/linalg/gemv.h index 0be11a0301..ac0547e30a 100644 --- a/cpp/include/raft/linalg/gemv.h +++ b/cpp/include/raft/linalg/gemv.h @@ -26,14 +26,23 @@ namespace raft { namespace linalg { template -void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows, - const int n_cols, const math_t *x, const int incx, math_t *y, - const int incy, const bool trans_a, const math_t alpha, - const math_t beta, cudaStream_t stream) { +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows, + const int n_cols, + const math_t* x, + const int incx, + math_t* y, + const int incy, + const bool trans_a, + const math_t alpha, + const math_t beta, + cudaStream_t stream) +{ cublasHandle_t cublas_h = handle.get_cublas_handle(); - cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasgemv(cublas_h, op_a, n_rows, n_cols, &alpha, A, n_rows, x, - incx, &beta, y, incy, stream)); + cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK( + cublasgemv(cublas_h, op_a, n_rows, n_cols, &alpha, A, n_rows, x, incx, &beta, y, incy, stream)); } /** @@ -53,9 +62,17 @@ void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows, * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. */ template -void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, - const int n_cols_a, const math_t *x, math_t *y, const bool trans_a, - const math_t alpha, const math_t beta, cudaStream_t stream) { +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const math_t* x, + math_t* y, + const bool trans_a, + const math_t alpha, + const math_t beta, + cudaStream_t stream) +{ gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); } @@ -72,11 +89,17 @@ void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. */ template -void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, - const int n_cols_a, const math_t *x, math_t *y, const bool trans_a, - cudaStream_t stream) { +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const math_t* x, + math_t* y, + const bool trans_a, + cudaStream_t stream) +{ math_t alpha = math_t(1); - math_t beta = math_t(0); + math_t beta = math_t(0); gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); } @@ -102,14 +125,22 @@ void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. */ template -void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, - const int n_cols_a, const int lda, const math_t *x, math_t *y, - const bool trans_a, const math_t alpha, const math_t beta, - cudaStream_t stream) { +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const int lda, + const math_t* x, + math_t* y, + const bool trans_a, + const math_t alpha, + const math_t beta, + cudaStream_t stream) +{ cublasHandle_t cublas_h = handle.get_cublas_handle(); - cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasgemv(cublas_h, op_a, n_rows_a, n_cols_a, &alpha, A, lda, x, - 1, &beta, y, 1, stream)); + cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK( + cublasgemv(cublas_h, op_a, n_rows_a, n_cols_a, &alpha, A, lda, x, 1, &beta, y, 1, stream)); } /** @@ -130,11 +161,18 @@ void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, * */ template -void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, - const int n_cols_a, const int lda, const math_t *x, math_t *y, - const bool trans_a, cudaStream_t stream) { +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const int lda, + const math_t* x, + math_t* y, + const bool trans_a, + cudaStream_t stream) +{ math_t alpha = math_t(1); - math_t beta = math_t(0); + math_t beta = math_t(0); gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream); } diff --git a/cpp/include/raft/linalg/init.h b/cpp/include/raft/linalg/init.h index 9944685a1f..41ef4d4641 100644 --- a/cpp/include/raft/linalg/init.h +++ b/cpp/include/raft/linalg/init.h @@ -37,7 +37,8 @@ namespace { * \param [in] stream cuda stream */ template -void range(T *out, int start, int end, cudaStream_t stream) { +void range(T* out, int start, int end, cudaStream_t stream) +{ thrust::counting_iterator first(start); thrust::counting_iterator last = first + (end - start); thrust::device_ptr ptr(out); @@ -54,7 +55,8 @@ void range(T *out, int start, int end, cudaStream_t stream) { * \param [in] stream cuda stream */ template -void range(T *out, int n, cudaStream_t stream) { +void range(T* out, int n, cudaStream_t stream) +{ range(out, 0, n, stream); } } // unnamed namespace diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index b775a1f696..39089473e3 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -16,7 +16,7 @@ #pragma once -//for cmath: +// for cmath: #define _USE_MATH_DEFINES #include @@ -40,14 +40,14 @@ using namespace linalg; namespace spectral { // curandGeneratorNormalX -inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, - float *outputPtr, size_t n, - float mean, float stddev) { +inline curandStatus_t curandGenerateNormalX( + curandGenerator_t generator, float* outputPtr, size_t n, float mean, float stddev) +{ return curandGenerateNormal(generator, outputPtr, n, mean, stddev); } -inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, - double *outputPtr, size_t n, - double mean, double stddev) { +inline curandStatus_t curandGenerateNormalX( + curandGenerator_t generator, double* outputPtr, size_t n, double mean, double stddev) +{ return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev); } @@ -55,7 +55,7 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, // Helper functions // ========================================================= -/** +/** * @brief Perform Lanczos iteration * Lanczos iteration is performed on a shifted matrix A+shift*I. * @tparam index_type_t the type of data used for indexing. @@ -85,25 +85,30 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator, * @return Zero if successful. Otherwise non-zero. */ template -int performLanczosIteration( - handle_t const &handle, sparse_matrix_t const *A, - index_type_t *iter, index_type_t maxIter, value_type_t shift, - value_type_t tol, bool reorthogonalize, value_type_t *__restrict__ alpha_host, - value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev) { +int performLanczosIteration(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t* iter, + index_type_t maxIter, + value_type_t shift, + value_type_t tol, + bool reorthogonalize, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev) +{ // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful variables - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; constexpr value_type_t negOne = -1; - constexpr value_type_t zero = 0; + constexpr value_type_t zero = 0; value_type_t alpha; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); RAFT_EXPECTS(A != nullptr, "Null matrix pointer."); @@ -117,29 +122,28 @@ int performLanczosIteration( // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev, + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, + lanczosVecs_dev, n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, stream)); + cudaMemcpyDeviceToDevice, + stream)); A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n); // Orthogonalize Lanczos vector - CUBLAS_CHECK(cublasdot(cublas_h, n, lanczosVecs_dev, 1, - lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, - stream)); + CUBLAS_CHECK(cublasdot( + cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream)); alpha = -alpha_host[0]; - CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1, - lanczosVecs_dev + IDX(0, 1, n), 1, stream)); - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, - beta_host, stream)); + CUBLAS_CHECK(cublasaxpy( + cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream)); // Check if Lanczos has converged if (beta_host[0] <= tol) return 0; // Normalize Lanczos vector alpha = 1 / beta_host[0]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), - 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream)); } // ------------------------------------------------------- @@ -151,65 +155,121 @@ int performLanczosIteration( // Apply matrix if (shift != 0) - CUDA_TRY(cudaMemcpyAsync( - lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n, - n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); - A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, - lanczosVecs_dev + IDX(0, *iter, n)); + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n, + lanczosVecs_dev + (*iter - 1) * n, + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); + A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n)); // Full reorthogonalization // "Twice is enough" algorithm per Kahan and Parlett if (reorthogonalize) { - CUBLAS_CHECK(cublasgemv( - cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, - lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, - lanczosVecs_dev, n, work_dev, 1, &one, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); - - CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1), - sizeof(value_type_t), cudaMemcpyDeviceToHost, + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); + + CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), + work_dev + (*iter - 1), + sizeof(value_type_t), + cudaMemcpyDeviceToHost, stream)); - CUBLAS_CHECK(cublasgemv( - cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n, - lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream)); - - CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne, - lanczosVecs_dev, n, work_dev, 1, &one, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_T, + n, + *iter, + &one, + lanczosVecs_dev, + n, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + &zero, + work_dev, + 1, + stream)); + + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + *iter, + &negOne, + lanczosVecs_dev, + n, + work_dev, + 1, + &one, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); } // Orthogonalization with 3-term recurrence relation else { - CUBLAS_CHECK(cublasdot(cublas_h, n, - lanczosVecs_dev + IDX(0, *iter - 1, n), 1, - lanczosVecs_dev + IDX(0, *iter, n), 1, - alpha_host + (*iter - 1), stream)); + CUBLAS_CHECK(cublasdot(cublas_h, + n, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + alpha_host + (*iter - 1), + stream)); auto alpha = -alpha_host[*iter - 1]; - CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, - lanczosVecs_dev + IDX(0, *iter - 1, n), 1, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 1, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); alpha = -beta_host[*iter - 2]; - CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, - lanczosVecs_dev + IDX(0, *iter - 2, n), 1, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasaxpy(cublas_h, + n, + &alpha, + lanczosVecs_dev + IDX(0, *iter - 2, n), + 1, + lanczosVecs_dev + IDX(0, *iter, n), + 1, + stream)); } // Compute residual - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, - beta_host + *iter - 1, stream)); + CUBLAS_CHECK(cublasnrm2( + cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream)); // Check if Lanczos has converged if (beta_host[*iter - 1] <= tol) break; // Normalize Lanczos vector alpha = 1 / beta_host[*iter - 1]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, - lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream)); } CUDA_TRY(cudaStreamSynchronize(stream)); @@ -217,7 +277,7 @@ int performLanczosIteration( return 0; } -/** +/** * @brief Find Householder transform for 3-dimensional system * Given an input vector v=[x,y,z]', this function finds a * Householder transform P such that P*v is a multiple of @@ -235,8 +295,8 @@ int performLanczosIteration( * matrix. Matrix dimensions are 3 x 3. */ template -static void findHouseholder3(value_type_t *v, value_type_t *Pv, - value_type_t *P) { +static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P) +{ // Compute norm of vector *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); @@ -246,8 +306,7 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv, v[0] -= *Pv; // Normalize Householder vector - value_type_t normHouseholder = - std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + value_type_t normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); if (normHouseholder != 0) { v[0] /= normHouseholder; v[1] /= normHouseholder; @@ -261,11 +320,13 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv, // Construct Householder matrix index_type_t i, j; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) P[IDX(i, j, 3)] = -2 * v[i] * v[j]; - for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1; + for (i = 0; i < 3; ++i) + P[IDX(i, j, 3)] = -2 * v[i] * v[j]; + for (i = 0; i < 3; ++i) + P[IDX(i, i, 3)] += 1; } -/** +/** * @brief Apply 3-dimensional Householder transform to 4 x 4 matrix * The Householder transform is pre-applied to the top three rows * of the matrix and post-applied to the left three columns. The @@ -277,7 +338,8 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv, * @param A (Input/output, host memory, 16 entries) 4 x 4 matrix. */ template -static void applyHouseholder3(const value_type_t *v, value_type_t *A) { +static void applyHouseholder3(const value_type_t* v, value_type_t* A) +{ // Loop indices index_type_t i, j; // Dot product between Householder vector and matrix row/column @@ -286,19 +348,23 @@ static void applyHouseholder3(const value_type_t *v, value_type_t *A) { // Pre-apply Householder transform for (j = 0; j < 4; ++j) { vDotA = 0; - for (i = 0; i < 3; ++i) vDotA += v[i] * A[IDX(i, j, 4)]; - for (i = 0; i < 3; ++i) A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; + for (i = 0; i < 3; ++i) + vDotA += v[i] * A[IDX(i, j, 4)]; + for (i = 0; i < 3; ++i) + A[IDX(i, j, 4)] -= 2 * v[i] * vDotA; } // Post-apply Householder transform for (i = 0; i < 4; ++i) { vDotA = 0; - for (j = 0; j < 3; ++j) vDotA += A[IDX(i, j, 4)] * v[j]; - for (j = 0; j < 3; ++j) A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; + for (j = 0; j < 3; ++j) + vDotA += A[IDX(i, j, 4)] * v[j]; + for (j = 0; j < 3; ++j) + A[IDX(i, j, 4)] -= 2 * vDotA * v[j]; } } -/** +/** * @brief Perform one step of Francis QR algorithm * Equivalent to two steps of the classical QR algorithm on a * tridiagonal matrix. @@ -319,10 +385,14 @@ static void applyHouseholder3(const value_type_t *v, value_type_t *A) { * @return Zero if successful. Otherwise non-zero. */ template -static int francisQRIteration(index_type_t n, value_type_t shift1, - value_type_t shift2, value_type_t *alpha, - value_type_t *beta, value_type_t *V, - value_type_t *work) { +static int francisQRIteration(index_type_t n, + value_type_t shift1, + value_type_t shift2, + value_type_t* alpha, + value_type_t* beta, + value_type_t* V, + value_type_t* work) +{ // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- @@ -352,30 +422,30 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c; householder[1] = beta[0] * (alpha[0] + alpha[1] + b); householder[2] = beta[0] * beta[1]; - findHouseholder3(householder, &temp, - householderMatrix); + findHouseholder3(householder, &temp, householderMatrix); // Apply initial Householder transform to create bulge memset(bulge, 0, 16 * sizeof(value_type_t)); - for (i = 0; i < 4; ++i) bulge[IDX(i, i, 4)] = alpha[i]; + for (i = 0; i < 4; ++i) + bulge[IDX(i, i, 4)] = alpha[i]; for (i = 0; i < 3; ++i) { bulge[IDX(i + 1, i, 4)] = beta[i]; bulge[IDX(i, i + 1, 4)] = beta[i]; } applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, - 3, 0, work, n); + Lapack::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n); memcpy(V, work, 3 * n * sizeof(value_type_t)); // Chase bulge to bottom-right of matrix with Householder transforms for (pos = 0; pos < n - 4; ++pos) { // Move to next position - alpha[pos] = bulge[IDX(0, 0, 4)]; + alpha[pos] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = bulge[IDX(3, 0, 4)]; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; bulge[IDX(3, 0, 4)] = 0; bulge[IDX(3, 1, 4)] = 0; bulge[IDX(3, 2, 4)] = beta[pos + 3]; @@ -385,22 +455,22 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, bulge[IDX(3, 3, 4)] = alpha[pos + 4]; // Apply Householder transform - findHouseholder3(householder, beta + pos, - householderMatrix); + findHouseholder3(householder, beta + pos, householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), - n, householderMatrix, 3, 0, work, n); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t)); } // Apply penultimate Householder transform // Values in the last row and column are zero - alpha[n - 4] = bulge[IDX(0, 0, 4)]; + alpha[n - 4] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = bulge[IDX(3, 0, 4)]; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; bulge[IDX(3, 0, 4)] = 0; bulge[IDX(3, 1, 4)] = 0; bulge[IDX(3, 2, 4)] = 0; @@ -408,37 +478,36 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, bulge[IDX(1, 3, 4)] = 0; bulge[IDX(2, 3, 4)] = 0; bulge[IDX(3, 3, 4)] = 0; - findHouseholder3(householder, beta + n - 4, - householderMatrix); + findHouseholder3(householder, beta + n - 4, householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, - householderMatrix, 3, 0, work, n); + Lapack::gemm( + false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t)); // Apply final Householder transform // Values in the last two rows and columns are zero - alpha[n - 3] = bulge[IDX(0, 0, 4)]; + alpha[n - 3] = bulge[IDX(0, 0, 4)]; householder[0] = bulge[IDX(1, 0, 4)]; householder[1] = bulge[IDX(2, 0, 4)]; householder[2] = 0; for (j = 0; j < 3; ++j) - for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; - findHouseholder3(householder, beta + n - 3, - householderMatrix); + for (i = 0; i < 3; ++i) + bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)]; + findHouseholder3(householder, beta + n - 3, householderMatrix); applyHouseholder3(householder, bulge); - Lapack::gemm(false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, - householderMatrix, 3, 0, work, n); + Lapack::gemm( + false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n); memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t)); // Bulge has been eliminated alpha[n - 2] = bulge[IDX(0, 0, 4)]; alpha[n - 1] = bulge[IDX(1, 1, 4)]; - beta[n - 2] = bulge[IDX(1, 0, 4)]; + beta[n - 2] = bulge[IDX(1, 0, 4)]; return 0; } -/** +/** * @brief Perform implicit restart of Lanczos algorithm * Shifts are Chebyshev nodes of unwanted region of matrix spectrum. * @tparam index_type_t the type of data used for indexing. @@ -474,23 +543,30 @@ static int francisQRIteration(index_type_t n, value_type_t shift1, * @return error flag. */ template -static int lanczosRestart( - handle_t const &handle, index_type_t n, index_type_t iter, - index_type_t iter_new, value_type_t *shiftUpper, value_type_t *shiftLower, - value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ V_host, value_type_t *__restrict__ work_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev, bool smallest_eig) { +static int lanczosRestart(handle_t const& handle, + index_type_t n, + index_type_t iter, + index_type_t iter_new, + value_type_t* shiftUpper, + value_type_t* shiftLower, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ V_host, + value_type_t* __restrict__ work_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + bool smallest_eig) +{ // ------------------------------------------------------- // Variable declaration // ------------------------------------------------------- // Useful constants constexpr value_type_t zero = 0; - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Loop index index_type_t i; @@ -501,12 +577,12 @@ static int lanczosRestart( index_type_t restartSteps = iter - iter_new; // Ritz values from Lanczos method - value_type_t *ritzVals_host = work_host + 3 * iter; + value_type_t* ritzVals_host = work_host + 3 * iter; // Shifts for implicit restart - value_type_t *shifts_host; + value_type_t* shifts_host; // Orthonormal matrix for similarity transform - value_type_t *V_dev = work_dev + n * iter; + value_type_t* V_dev = work_dev + n * iter; // ------------------------------------------------------- // Implementation @@ -524,7 +600,8 @@ static int lanczosRestart( // Initialize similarity transform with identity matrix memset(V_host, 0, iter * iter * sizeof(value_type_t)); - for (i = 0; i < iter; ++i) V_host[IDX(i, i, iter)] = 1; + for (i = 0; i < iter; ++i) + V_host[IDX(i, i, iter)] = 1; // Determine interval to suppress eigenvalues if (smallest_eig) { @@ -548,49 +625,71 @@ static int lanczosRestart( // Calculate Chebyshev nodes as shifts shifts_host = ritzVals_host; for (i = 0; i < restartSteps; ++i) { - shifts_host[i] = - cos((i + 0.5) * static_cast(M_PI) / restartSteps); + shifts_host[i] = cos((i + 0.5) * static_cast(M_PI) / restartSteps); shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower)); shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower)); } // Apply Francis QR algorithm to implicitly restart Lanczos for (i = 0; i < restartSteps; i += 2) - if (francisQRIteration(iter, shifts_host[i], shifts_host[i + 1], alpha_host, - beta_host, V_host, work_host)) + if (francisQRIteration( + iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host)) WARNING("error in implicitly shifted QR algorithm"); // Obtain new residual - CUDA_TRY(cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); - - beta_host[iter - 1] = - beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; - CUBLAS_CHECK(cublasgemv( - cublas_h, CUBLAS_OP_N, n, iter, beta_host + iter_new - 1, lanczosVecs_dev, - n, V_dev + IDX(0, iter_new, iter), 1, beta_host + iter - 1, - lanczosVecs_dev + IDX(0, iter, n), 1, stream)); + CUDA_TRY(cudaMemcpyAsync( + V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); + + beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)]; + CUBLAS_CHECK(cublasgemv(cublas_h, + CUBLAS_OP_N, + n, + iter, + beta_host + iter_new - 1, + lanczosVecs_dev, + n, + V_dev + IDX(0, iter_new, iter), + 1, + beta_host + iter - 1, + lanczosVecs_dev + IDX(0, iter, n), + 1, + stream)); // Obtain new Lanczos vectors - CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, iter_new, iter, - &one, lanczosVecs_dev, n, V_dev, iter, &zero, - work_dev, n, stream)); - - CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev, + CUBLAS_CHECK(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + iter_new, + iter, + &one, + lanczosVecs_dev, + n, + V_dev, + iter, + &zero, + work_dev, + n, + stream)); + + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, + work_dev, n * iter_new * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, stream)); + cudaMemcpyDeviceToDevice, + stream)); // Normalize residual to obtain new Lanczos vector - CUDA_TRY(cudaMemcpyAsync( - lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n), - n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n), + lanczosVecs_dev + IDX(0, iter, n), + n * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, + stream)); - CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, - beta_host + iter_new - 1, stream)); + CUBLAS_CHECK(cublasnrm2( + cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream)); auto h_beta = 1 / beta_host[iter_new - 1]; - CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, - lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); + CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream)); return 0; } @@ -601,7 +700,7 @@ static int lanczosRestart( // Eigensolver // ========================================================= -/** +/** * @brief Compute smallest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -651,19 +750,28 @@ static int lanczosRestart( * @return error flag. */ template -int computeSmallestEigenvectors( - handle_t const &handle, sparse_matrix_t const *A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t *effIter, - index_type_t *totalIter, value_type_t *shift, - value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { +int computeSmallestEigenvectors(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t* effIter, + index_type_t* totalIter, + value_type_t* shift, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed) +{ using namespace spectral; // Useful constants - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; constexpr value_type_t zero = 0; // Matrix dimension @@ -683,21 +791,20 @@ int computeSmallestEigenvectors( index_type_t i; // Host memory - value_type_t *Z_host; // Eigenvectors in Lanczos basis - value_type_t *work_host; // Workspace + value_type_t* Z_host; // Eigenvectors in Lanczos basis + value_type_t* work_host; // Workspace // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // ------------------------------------------------------- // Variable initialization @@ -710,12 +817,11 @@ int computeSmallestEigenvectors( std::vector Z_host_v(restartIter * restartIter); std::vector work_host_v(4 * restartIter); - Z_host = Z_host_v.data(); + Z_host = Z_host_v.data(); work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue to determine shift @@ -738,10 +844,18 @@ int computeSmallestEigenvectors( // Obtain tridiagonal matrix with Lanczos *effIter = 0; - *shift = 0; - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, 0.0, reorthogonalize, alpha_host, - beta_host, lanczosVecs_dev, work_dev); + *shift = 0; + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0.0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); // Determine largest eigenvalue @@ -756,9 +870,17 @@ int computeSmallestEigenvectors( // Obtain tridiagonal matrix with Lanczos *effIter = 0; - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, - beta_host, lanczosVecs_dev, work_dev); + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter; @@ -775,9 +897,19 @@ int computeSmallestEigenvectors( if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart( - handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, - beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, true); + status = lanczosRestart(handle, + n, + *effIter, + iter_new, + &shiftUpper, + &shiftLower, + alpha_host, + beta_host, + Z_host, + work_host, + lanczosVecs_dev, + work_dev, + true); if (status) WARNING("error in Lanczos implicit restart"); *effIter = iter_new; @@ -786,9 +918,17 @@ int computeSmallestEigenvectors( // Proceed with Lanczos method - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), - reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + tol * fabs(shiftLower), + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter - iter_new; } @@ -799,39 +939,59 @@ int computeSmallestEigenvectors( } // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, - (*effIter) * sizeof(value_type_t)); - memcpy(work_host + 3 * (*effIter), beta_host, - (*effIter - 1) * sizeof(value_type_t)); - Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), - work_host + 3 * (*effIter), Z_host, *effIter, + memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', + *effIter, + work_host + 2 * (*effIter), + work_host + 3 * (*effIter), + Z_host, + *effIter, work_host); // Obtain desired eigenvalues by applying shift - for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift; - for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0; + for (i = 0; i < *effIter; ++i) + work_host[i + 2 * (*effIter)] -= *shift; + for (i = *effIter; i < nEigVecs; ++i) + work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory - CUDA_TRY(cudaMemcpyAsync(eigVals_dev, work_host + 2 * (*effIter), + CUDA_TRY(cudaMemcpyAsync(eigVals_dev, + work_host + 2 * (*effIter), nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); + cudaMemcpyHostToDevice, + stream)); - CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host, + CUDA_TRY(cudaMemcpyAsync(work_dev, + Z_host, (*effIter) * nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); + cudaMemcpyHostToDevice, + stream)); CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis - CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, - *effIter, &one, lanczosVecs_dev, n, work_dev, - *effIter, &zero, eigVecs_dev, n, stream)); + CUBLAS_CHECK(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); // Clean up and exit curandDestroyGenerator(randGen); return 0; } -/** +/** * @brief Compute smallest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -869,20 +1029,25 @@ int computeSmallestEigenvectors( * @return error flag. */ template -int computeSmallestEigenvectors( - handle_t const &handle, sparse_matrix_t const &A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t &iter, - value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) { +int computeSmallestEigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 1234567) +{ using namespace spectral; // Matrix dimension index_type_t n = A.nrows_; // Check that parameters are valid - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); @@ -892,8 +1057,8 @@ int computeSmallestEigenvectors( std::vector alpha_host_v(restartIter); std::vector beta_host_v(restartIter); - value_type_t *alpha_host = alpha_host_v.data(); - value_type_t *beta_host = beta_host_v.data(); + value_type_t* alpha_host = alpha_host_v.data(); + value_type_t* beta_host = beta_host_v.data(); vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); vector_t work_dev(handle, (n + restartIter) * restartIter); @@ -901,10 +1066,23 @@ int computeSmallestEigenvectors( // Perform Lanczos method index_type_t effIter; value_type_t shift; - int status = computeSmallestEigenvectors( - handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, - &iter, &shift, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), - eigVals_dev, eigVecs_dev, seed); + int status = computeSmallestEigenvectors(handle, + &A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + &shift, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); // Clean up and return return status; @@ -914,7 +1092,7 @@ int computeSmallestEigenvectors( // Eigensolver // ========================================================= -/** +/** * @brief Compute largest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -959,19 +1137,27 @@ int computeSmallestEigenvectors( * @return error flag. */ template -int computeLargestEigenvectors( - handle_t const &handle, sparse_matrix_t const *A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t *effIter, - index_type_t *totalIter, value_type_t *__restrict__ alpha_host, - value_type_t *__restrict__ beta_host, - value_type_t *__restrict__ lanczosVecs_dev, - value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) { +int computeLargestEigenvectors(handle_t const& handle, + sparse_matrix_t const* A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t* effIter, + index_type_t* totalIter, + value_type_t* __restrict__ alpha_host, + value_type_t* __restrict__ beta_host, + value_type_t* __restrict__ lanczosVecs_dev, + value_type_t* __restrict__ work_dev, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed) +{ using namespace spectral; // Useful constants - constexpr value_type_t one = 1; + constexpr value_type_t one = 1; constexpr value_type_t zero = 0; // Matrix dimension @@ -987,8 +1173,8 @@ int computeLargestEigenvectors( index_type_t i; // Host memory - value_type_t *Z_host; // Eigenvectors in Lanczos basis - value_type_t *work_host; // Workspace + value_type_t* Z_host; // Eigenvectors in Lanczos basis + value_type_t* work_host; // Workspace // ------------------------------------------------------- // Check that LAPACK is enabled @@ -998,15 +1184,14 @@ int computeLargestEigenvectors( // ------------------------------------------------------- // Check that parameters are valid // ------------------------------------------------------- - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter."); auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // ------------------------------------------------------- // Variable initialization @@ -1019,12 +1204,11 @@ int computeLargestEigenvectors( std::vector Z_host_v(restartIter * restartIter); std::vector work_host_v(4 * restartIter); - Z_host = Z_host_v.data(); + Z_host = Z_host_v.data(); work_host = work_host_v.data(); // Initialize cuBLAS - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // Compute largest eigenvalue @@ -1044,13 +1228,21 @@ int computeLargestEigenvectors( CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream)); // Obtain tridiagonal matrix with Lanczos - *effIter = 0; + *effIter = 0; value_type_t shift_val = 0.0; - value_type_t *shift = &shift_val; - - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host, - beta_host, lanczosVecs_dev, work_dev); + value_type_t* shift = &shift_val; + + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + 0, + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter; @@ -1067,9 +1259,19 @@ int computeLargestEigenvectors( if (iter_new == *effIter) break; // Implicit restart of Lanczos method - status = lanczosRestart( - handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host, - beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, false); + status = lanczosRestart(handle, + n, + *effIter, + iter_new, + &shiftUpper, + &shiftLower, + alpha_host, + beta_host, + Z_host, + work_host, + lanczosVecs_dev, + work_dev, + false); if (status) WARNING("error in Lanczos implicit restart"); *effIter = iter_new; @@ -1078,9 +1280,17 @@ int computeLargestEigenvectors( // Proceed with Lanczos method - status = performLanczosIteration( - handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower), - reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev); + status = performLanczosIteration(handle, + A, + effIter, + maxIter_curr, + *shift, + tol * fabs(shiftLower), + reorthogonalize, + alpha_host, + beta_host, + lanczosVecs_dev, + work_dev); if (status) WARNING("error in Lanczos iteration"); *totalIter += *effIter - iter_new; } @@ -1090,15 +1300,18 @@ int computeLargestEigenvectors( WARNING("implicitly restarted Lanczos failed to converge"); } for (int i = 0; i < restartIter; ++i) { - for (int j = 0; j < restartIter; ++j) Z_host[i * restartIter + j] = 0; + for (int j = 0; j < restartIter; ++j) + Z_host[i * restartIter + j] = 0; } // Solve tridiagonal system - memcpy(work_host + 2 * (*effIter), alpha_host, - (*effIter) * sizeof(value_type_t)); - memcpy(work_host + 3 * (*effIter), beta_host, - (*effIter - 1) * sizeof(value_type_t)); - Lapack::steqr('I', *effIter, work_host + 2 * (*effIter), - work_host + 3 * (*effIter), Z_host, *effIter, + memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t)); + memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t)); + Lapack::steqr('I', + *effIter, + work_host + 2 * (*effIter), + work_host + 3 * (*effIter), + Z_host, + *effIter, work_host); // note: We need to pick the top nEigVecs eigenvalues @@ -1123,36 +1336,52 @@ int computeLargestEigenvectors( //} // Obtain desired eigenvalues by applying shift - for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift; + for (i = 0; i < *effIter; ++i) + work_host[i + 2 * (*effIter)] -= *shift; for (i = 0; i < top_eigenparis_idx_offset; ++i) work_host[i + 2 * (*effIter)] = 0; // Copy results to device memory // skip smallest eigenvalue if needed - CUDA_TRY(cudaMemcpyAsync( - eigVals_dev, work_host + 2 * (*effIter) + top_eigenparis_idx_offset, - nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(eigVals_dev, + work_host + 2 * (*effIter) + top_eigenparis_idx_offset, + nEigVecs * sizeof(value_type_t), + cudaMemcpyHostToDevice, + stream)); // skip smallest eigenvector if needed CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host + (top_eigenparis_idx_offset * (*effIter)), (*effIter) * nEigVecs * sizeof(value_type_t), - cudaMemcpyHostToDevice, stream)); + cudaMemcpyHostToDevice, + stream)); CHECK_CUDA(stream); // Convert eigenvectors from Lanczos basis to standard basis - CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs, - *effIter, &one, lanczosVecs_dev, n, work_dev, - *effIter, &zero, eigVecs_dev, n, stream)); + CUBLAS_CHECK(cublasgemm(cublas_h, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + nEigVecs, + *effIter, + &one, + lanczosVecs_dev, + n, + work_dev, + *effIter, + &zero, + eigVecs_dev, + n, + stream)); // Clean up and exit curandDestroyGenerator(randGen); return 0; } -/** +/** * @brief Compute largest eigenvectors of symmetric matrix * Computes eigenvalues and eigenvectors that are least * positive. If matrix is positive definite or positive @@ -1190,18 +1419,23 @@ int computeLargestEigenvectors( * @return error flag. */ template -int computeLargestEigenvectors( - handle_t const &handle, sparse_matrix_t const &A, - index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter, - value_type_t tol, bool reorthogonalize, index_type_t &iter, - value_type_t *__restrict__ eigVals_dev, - value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 123456) { +int computeLargestEigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 123456) +{ // Matrix dimension index_type_t n = A.nrows_; // Check that parameters are valid - RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, - "Invalid number of eigenvectors."); + RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors."); RAFT_EXPECTS(restartIter > 0, "Invalid restartIter."); RAFT_EXPECTS(tol > 0, "Invalid tolerance."); RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter."); @@ -1211,18 +1445,30 @@ int computeLargestEigenvectors( std::vector alpha_host_v(restartIter); std::vector beta_host_v(restartIter); - value_type_t *alpha_host = alpha_host_v.data(); - value_type_t *beta_host = beta_host_v.data(); + value_type_t* alpha_host = alpha_host_v.data(); + value_type_t* beta_host = beta_host_v.data(); vector_t lanczosVecs_dev(handle, n * (restartIter + 1)); vector_t work_dev(handle, (n + restartIter) * restartIter); // Perform Lanczos method index_type_t effIter; - int status = computeLargestEigenvectors( - handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter, - &iter, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(), - eigVals_dev, eigVecs_dev, seed); + int status = computeLargestEigenvectors(handle, + &A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + &effIter, + &iter, + alpha_host, + beta_host, + lanczosVecs_dev.raw(), + work_dev.raw(), + eigVals_dev, + eigVecs_dev, + seed); // Clean up and return return status; diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh index aff08da2d3..200818fdc3 100644 --- a/cpp/include/raft/linalg/map.cuh +++ b/cpp/include/raft/linalg/map.cuh @@ -24,21 +24,18 @@ namespace raft { namespace linalg { -template -__global__ void mapKernel(OutType *out, size_t len, MapOp map, const InType *in, - Args... args) { +template +__global__ void mapKernel(OutType* out, size_t len, MapOp map, const InType* in, Args... args) +{ auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); - if (idx < len) { - out[idx] = map(in[idx], args[idx]...); - } + if (idx < len) { out[idx] = map(in[idx], args[idx]...); } } -template -void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream, - const InType *in, Args... args) { +template +void mapImpl( + OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ const int nblks = raft::ceildiv(len, (size_t)TPB); mapKernel <<>>(out, len, map, in, args...); @@ -60,12 +57,14 @@ void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream, * @param args additional input arrays */ -template -void map(OutType *out, size_t len, MapOp map, cudaStream_t stream, - const InType *in, Args... args) { - mapImpl(out, len, map, stream, in, - args...); +void map(OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ + mapImpl(out, len, map, stream, in, args...); } } // namespace linalg diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/map_then_reduce.cuh index f2f198670a..78a7017c5c 100644 --- a/cpp/include/raft/linalg/map_then_reduce.cuh +++ b/cpp/include/raft/linalg/map_then_reduce.cuh @@ -24,50 +24,66 @@ namespace raft { namespace linalg { -struct sum_tag {}; +struct sum_tag { +}; template -__device__ void reduce(OutType *out, const InType acc, sum_tag) { +__device__ void reduce(OutType* out, const InType acc, sum_tag) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; OutType tmp = BlockReduce(temp_storage).Sum(acc); - if (threadIdx.x == 0) { - raft::myAtomicAdd(out, tmp); - } + if (threadIdx.x == 0) { raft::myAtomicAdd(out, tmp); } } template -__device__ void reduce(OutType *out, const InType acc, ReduceLambda op) { +__device__ void reduce(OutType* out, const InType acc, ReduceLambda op) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; OutType tmp = BlockReduce(temp_storage).Reduce(acc, op); - if (threadIdx.x == 0) { - raft::myAtomicReduce(out, tmp, op); - } + if (threadIdx.x == 0) { raft::myAtomicReduce(out, tmp, op); } } -template -__global__ void mapThenReduceKernel(OutType *out, size_t len, OutType neutral, - MapOp map, ReduceLambda op, - const InType *in, Args... args) { +template +__global__ void mapThenReduceKernel(OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + const InType* in, + Args... args) +{ OutType acc = neutral; - auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); + auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); - if (idx < len) { - acc = map(in[idx], args[idx]...); - } + if (idx < len) { acc = map(in[idx], args[idx]...); } __syncthreads(); reduce(out, acc, op); } -template -void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map, - ReduceLambda op, cudaStream_t stream, const InType *in, - Args... args) { +template +void mapThenReduceImpl(OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + cudaStream_t stream, + const InType* in, + Args... args) +{ raft::update_device(out, &neutral, 1, stream); const int nblks = raft::ceildiv(len, (size_t)TPB); mapThenReduceKernel @@ -89,10 +105,14 @@ void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map, * @param args additional input arrays */ -template -void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream, - const InType *in, Args... args) { +void mapThenSumReduce( + OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ mapThenReduceImpl( out, len, (OutType)0, map, sum_tag(), stream, in, args...); } @@ -115,11 +135,21 @@ void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream, * @param args additional input arrays */ -template -void mapThenReduce(OutType *out, size_t len, OutType neutral, MapOp map, - ReduceLambda op, cudaStream_t stream, const InType *in, - Args... args) { +template +void mapThenReduce(OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + cudaStream_t stream, + const InType* in, + Args... args) +{ mapThenReduceImpl( out, len, neutral, map, op, stream, in, args...); } diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 93f2d746fa..81c1919b2e 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -27,19 +27,24 @@ namespace { template struct AlignedAccess { template - static inline bool test(const T *matrix, size_t strideBytes) { - return Pow2::isAligned(matrix) && - Pow2::isAligned(strideBytes) && + static inline bool test(const T* matrix, size_t strideBytes) + { + return Pow2::isAligned(matrix) && Pow2::isAligned(strideBytes) && Pow2::isAligned(VecBytes); } }; }; // namespace template -__global__ void matrixVectorOpKernel(Type *out, const Type *matrix, - const Type *vector, IdxType D, IdxType N, - bool rowMajor, bool bcastAlongRows, - Lambda op) { +__global__ void matrixVectorOpKernel(Type* out, + const Type* matrix, + const Type* vector, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op) +{ typedef TxN_t VecType; IdxType len = N * D; IdxType idx = threadIdx.x; @@ -70,17 +75,21 @@ __global__ void matrixVectorOpKernel(Type *out, const Type *matrix, mat.store(out, idx); } -template -void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec, - IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op, cudaStream_t stream) { - IdxType len = N * D; - IdxType nblks = - raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB); +template +void matrixVectorOpImpl(Type* out, + const Type* matrix, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ + IdxType len = N * D; + IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB); matrixVectorOpKernel - <<>>(out, matrix, vec, D, N, rowMajor, - bcastAlongRows, op); + <<>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op); CUDA_CHECK(cudaPeekAtLastError()); } @@ -108,10 +117,17 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec, * @param stream cuda stream where to launch work */ template -void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, - IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op, - cudaStream_t stream) { - IdxType stride = rowMajor ? D : N; +void matrixVectorOp(Type* out, + const Type* matrix, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ + IdxType stride = rowMajor ? D : N; size_t stride_bytes = stride * sizeof(Type); if (AlignedAccess<16>::test(matrix, stride_bytes)) { @@ -138,10 +154,16 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, ///@todo: come up with a cleaner interface to support these cases in future! template -__global__ void matrixVectorOpKernel(Type *out, const Type *matrix, - const Type *vector1, const Type *vector2, - IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op) { +__global__ void matrixVectorOpKernel(Type* out, + const Type* matrix, + const Type* vector1, + const Type* vector2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op) +{ typedef TxN_t VecType; IdxType len = N * D; IdxType idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio; @@ -174,15 +196,21 @@ __global__ void matrixVectorOpKernel(Type *out, const Type *matrix, mat.store(out, idx); } -template -void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1, - const Type *vec2, IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op, cudaStream_t stream) { +template +void matrixVectorOpImpl(Type* out, + const Type* matrix, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ IdxType nblks = raft::ceildiv(N * D, (IdxType)TPB); matrixVectorOpKernel - <<>>(out, matrix, vec1, vec2, D, N, rowMajor, - bcastAlongRows, op); + <<>>(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op); CUDA_CHECK(cudaPeekAtLastError()); } @@ -211,10 +239,18 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1, * @param stream cuda stream where to launch work */ template -void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1, - const Type *vec2, IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Lambda op, cudaStream_t stream) { - IdxType stride = rowMajor ? D : N; +void matrixVectorOp(Type* out, + const Type* matrix, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ + IdxType stride = rowMajor ? D : N; size_t stride_bytes = stride * sizeof(Type); if (AlignedAccess<16>::test(matrix, stride_bytes)) { diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh index 9d1538c172..a3fcc5bac6 100644 --- a/cpp/include/raft/linalg/mean_squared_error.cuh +++ b/cpp/include/raft/linalg/mean_squared_error.cuh @@ -24,7 +24,7 @@ namespace linalg { /** * @brief CUDA version mean squared error function mean((A-B)**2) * @tparam math_t data-type upon which the math operation will be performed - * @tparam TPB threads-per-block + * @tparam TPB threads-per-block * @param out the output mean squared error value (assumed to be a device pointer) * @param A input array (assumed to be a device pointer) * @param B input array (assumed to be a device pointer) @@ -33,14 +33,14 @@ namespace linalg { * @param stream cuda-stream where to launch this kernel */ template -void meanSquaredError(math_t* out, const math_t* A, const math_t* B, size_t len, - math_t weight, cudaStream_t stream) { +void meanSquaredError( + math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream) +{ auto sq_diff = [len, weight] __device__(const math_t a, const math_t b) { math_t diff = a - b; return diff * diff * weight / len; }; - mapThenSumReduce(out, len, sq_diff, stream, A, - B); + mapThenSumReduce(out, len, sq_diff, stream, A, B); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh index ce948c927d..53d57ecd00 100644 --- a/cpp/include/raft/linalg/multiply.cuh +++ b/cpp/include/raft/linalg/multiply.cuh @@ -33,11 +33,10 @@ namespace linalg { * @{ */ template -void multiplyScalar(math_t *out, const math_t *in, math_t scalar, IdxType len, - cudaStream_t stream) { +void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +{ unaryOp( - out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, - stream); + out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, stream); } /** @} */ diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh index 64930a7123..82558c8023 100644 --- a/cpp/include/raft/linalg/norm.cuh +++ b/cpp/include/raft/linalg/norm.cuh @@ -44,22 +44,46 @@ enum NormType { L1Norm = 0, L2Norm }; * @param stream cuda stream where to launch work * @param fin_op the final lambda op */ -template > -void rowNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type, - bool rowMajor, cudaStream_t stream, - Lambda fin_op = raft::Nop()) { +template > +void rowNorm(Type* dots, + const Type* data, + IdxType D, + IdxType N, + NormType type, + bool rowMajor, + cudaStream_t stream, + Lambda fin_op = raft::Nop()) +{ switch (type) { case L1Norm: - reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false, - raft::L1Op(), raft::Sum(), fin_op); + reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + true, + stream, + false, + raft::L1Op(), + raft::Sum(), + fin_op); break; case L2Norm: - reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false, - raft::L2Op(), raft::Sum(), fin_op); + reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + true, + stream, + false, + raft::L2Op(), + raft::Sum(), + fin_op); break; - default: - ASSERT(false, "Invalid norm type passed! [%d]", type); + default: ASSERT(false, "Invalid norm type passed! [%d]", type); }; } @@ -77,22 +101,46 @@ void rowNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type, * @param stream cuda stream where to launch work * @param fin_op the final lambda op */ -template > -void colNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type, - bool rowMajor, cudaStream_t stream, - Lambda fin_op = raft::Nop()) { +template > +void colNorm(Type* dots, + const Type* data, + IdxType D, + IdxType N, + NormType type, + bool rowMajor, + cudaStream_t stream, + Lambda fin_op = raft::Nop()) +{ switch (type) { case L1Norm: - reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false, - raft::L1Op(), raft::Sum(), fin_op); + reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + false, + stream, + false, + raft::L1Op(), + raft::Sum(), + fin_op); break; case L2Norm: - reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false, - raft::L2Op(), raft::Sum(), fin_op); + reduce(dots, + data, + D, + N, + (Type)0, + rowMajor, + false, + stream, + false, + raft::L2Op(), + raft::Sum(), + fin_op); break; - default: - ASSERT(false, "Invalid norm type passed! [%d]", type); + default: ASSERT(false, "Invalid norm type passed! [%d]", type); }; } diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh index a50448acbe..c85cfda934 100644 --- a/cpp/include/raft/linalg/qr.cuh +++ b/cpp/include/raft/linalg/qr.cuh @@ -41,14 +41,18 @@ namespace linalg { * @{ */ template -void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, - int n_rows, int n_cols, cudaStream_t stream) { +void qrGetQ(const raft::handle_t& handle, + const math_t* M, + math_t* Q, + int n_rows, + int n_cols, + cudaStream_t stream) +{ cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int m = n_rows, n = n_cols; int k = min(m, n); - CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); rmm::device_uvector tau(k, stream); CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream)); @@ -58,19 +62,16 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork)); rmm::device_uvector workspace(Lwork, stream); - CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(), - workspace.data(), Lwork, devInfo.data(), - stream)); + CUSOLVER_CHECK(cusolverDngeqrf( + cusolverH, m, n, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 CUDA_CHECK(cudaDeviceSynchronize()); #endif - CUSOLVER_CHECK( - cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); + CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork)); workspace.resize(Lwork, stream); - CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, m, n, k, Q, m, tau.data(), - workspace.data(), Lwork, devInfo.data(), - stream)); + CUSOLVER_CHECK(cusolverDnorgqr( + cusolverH, m, n, k, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); } /** @@ -84,29 +85,40 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, * @param stream cuda stream */ template -void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, - int n_rows, int n_cols, cudaStream_t stream) { +void qrGetQR(const raft::handle_t& handle, + math_t* M, + math_t* Q, + math_t* R, + int n_rows, + int n_cols, + cudaStream_t stream) +{ cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int m = n_rows, n = n_cols; rmm::device_uvector R_full(m * n, stream); rmm::device_uvector tau(min(m, n), stream); - CUDA_CHECK( - cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); + CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); int R_full_nrows = m, R_full_ncols = n; - CUDA_CHECK(cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK( + cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); int Lwork; rmm::device_scalar devInfo(stream); - CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows, - R_full_ncols, R_full.data(), - R_full_nrows, &Lwork)); + CUSOLVER_CHECK(cusolverDngeqrf_bufferSize( + cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, &Lwork)); rmm::device_uvector workspace(Lwork, stream); - CUSOLVER_CHECK(cusolverDngeqrf( - cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, - tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); + CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, + R_full_nrows, + R_full_ncols, + R_full.data(), + R_full_nrows, + tau.data(), + workspace.data(), + Lwork, + devInfo.data(), + stream)); // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail. #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020 CUDA_CHECK(cudaDeviceSynchronize()); @@ -114,17 +126,24 @@ void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream); - CUDA_CHECK(cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK( + cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); int Q_nrows = m, Q_ncols = n; - CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, Q_nrows, Q_ncols, - min(Q_ncols, Q_nrows), Q, Q_nrows, - tau.data(), &Lwork)); + CUSOLVER_CHECK(cusolverDnorgqr_bufferSize( + cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), &Lwork)); workspace.resize(Lwork, stream); - CUSOLVER_CHECK(cusolverDnorgqr( - cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), - workspace.data(), Lwork, devInfo.data(), stream)); + CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, + Q_nrows, + Q_ncols, + min(Q_ncols, Q_nrows), + Q, + Q_nrows, + tau.data(), + workspace.data(), + Lwork, + devInfo.data(), + stream)); } /** @} */ diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh index d39577bbdd..693a797db9 100644 --- a/cpp/include/raft/linalg/reduce.cuh +++ b/cpp/include/raft/linalg/reduce.cuh @@ -52,28 +52,33 @@ namespace linalg { * @param reduce_op binary reduction operation * @param final_op elementwise operation to apply before storing results */ -template , +template , typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void reduce(OutType *dots, const InType *data, int D, int N, OutType init, - bool rowMajor, bool alongRows, cudaStream_t stream, - bool inplace = false, - MainLambda main_op = raft::Nop(), + typename FinalLambda = raft::Nop> +void reduce(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + bool rowMajor, + bool alongRows, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) { + FinalLambda final_op = raft::Nop()) +{ if (rowMajor && alongRows) { - coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, - reduce_op, final_op); + coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if (rowMajor && !alongRows) { - stridedReduction(dots, data, D, N, init, stream, inplace, main_op, - reduce_op, final_op); + stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if (!rowMajor && alongRows) { - stridedReduction(dots, data, N, D, init, stream, inplace, main_op, - reduce_op, final_op); + stridedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); } else { - coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, - reduce_op, final_op); + coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); } } diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh index bba652e137..f931c976fd 100644 --- a/cpp/include/raft/linalg/strided_reduction.cuh +++ b/cpp/include/raft/linalg/strided_reduction.cuh @@ -28,14 +28,15 @@ namespace linalg { // of the matrix, i.e. reduce along columns for row major or reduce along rows // for column major layout template -__global__ void stridedSummationKernel(Type *dots, const Type *data, int D, - int N, Type init, MainLambda main_op) { +__global__ void stridedSummationKernel( + Type* dots, const Type* data, int D, int N, Type init, MainLambda main_op) +{ // Thread reduction Type thread_data = Type(init); - int colStart = blockIdx.x * blockDim.x + threadIdx.x; + int colStart = blockIdx.x * blockDim.x + threadIdx.x; if (colStart < D) { int rowStart = blockIdx.y * blockDim.y + threadIdx.y; - int stride = blockDim.y * gridDim.y; + int stride = blockDim.y * gridDim.y; for (int j = rowStart; j < N; j += stride) { int idx = colStart + j * D; thread_data += main_op(data[idx], j); @@ -44,8 +45,8 @@ __global__ void stridedSummationKernel(Type *dots, const Type *data, int D, // Block reduction extern __shared__ char tmp[]; // One element per thread in block - Type *temp = (Type *)tmp; // Cast to desired type - int myidx = threadIdx.x + blockDim.x * threadIdx.y; + Type* temp = (Type*)tmp; // Cast to desired type + int myidx = threadIdx.x + blockDim.x * threadIdx.y; temp[myidx] = thread_data; __syncthreads(); for (int j = blockDim.y / 2; j > 0; j /= 2) { @@ -54,24 +55,31 @@ __global__ void stridedSummationKernel(Type *dots, const Type *data, int D, } // Grid reduction - if ((colStart < D) && (threadIdx.y == 0)) - raft::myAtomicAdd(dots + colStart, temp[myidx]); + if ((colStart < D) && (threadIdx.y == 0)) raft::myAtomicAdd(dots + colStart, temp[myidx]); } // Kernel to perform reductions along the strided dimension // of the matrix, i.e. reduce along columns for row major or reduce along rows // for column major layout -template -__global__ void stridedReductionKernel(OutType *dots, const InType *data, int D, - int N, OutType init, MainLambda main_op, - ReduceLambda reduce_op) { +template +__global__ void stridedReductionKernel(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + MainLambda main_op, + ReduceLambda reduce_op) +{ // Thread reduction OutType thread_data = init; - IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x; + IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x; if (colStart < D) { IdxType rowStart = blockIdx.y * blockDim.y + threadIdx.y; - IdxType stride = blockDim.y * gridDim.y; + IdxType stride = blockDim.y * gridDim.y; for (IdxType j = rowStart; j < N; j += stride) { IdxType idx = colStart + j * D; thread_data = reduce_op(thread_data, main_op(data[idx], j)); @@ -79,14 +87,13 @@ __global__ void stridedReductionKernel(OutType *dots, const InType *data, int D, } // Block reduction - extern __shared__ char tmp[]; // One element per thread in block - auto *temp = (OutType *)tmp; // Cast to desired type + extern __shared__ char tmp[]; // One element per thread in block + auto* temp = (OutType*)tmp; // Cast to desired type IdxType myidx = threadIdx.x + ((IdxType)blockDim.x * (IdxType)threadIdx.y); - temp[myidx] = thread_data; + temp[myidx] = thread_data; __syncthreads(); for (int j = blockDim.y / 2; j > 0; j /= 2) { - if (threadIdx.y < j) - temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]); + if (threadIdx.y < j) temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]); __syncthreads(); } @@ -122,15 +129,23 @@ __global__ void stridedReductionKernel(OutType *dots, const InType *data, int D, * @param inplace reduction result added inplace or overwrites old values? * @param stream cuda stream where to launch work */ -template , +template , typename ReduceLambda = raft::Sum, - typename FinalLambda = raft::Nop> -void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, - OutType init, cudaStream_t stream, bool inplace = false, - MainLambda main_op = raft::Nop(), + typename FinalLambda = raft::Nop> +void stridedReduction(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), ReduceLambda reduce_op = raft::Sum(), - FinalLambda final_op = raft::Nop()) { + FinalLambda final_op = raft::Nop()) +{ ///@todo: this extra should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) if (!inplace) @@ -140,7 +155,7 @@ void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, // Arbitrary numbers for now, probably need to tune const dim3 thrds(32, 16); IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y); - elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; + elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x), raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread)); const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y; @@ -153,8 +168,7 @@ void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N, <<>>(dots, data, D, N, init, main_op); else stridedReductionKernel - <<>>(dots, data, D, N, init, main_op, - reduce_op); + <<>>(dots, data, D, N, init, main_op, reduce_op); ///@todo: this complication should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh index 882c105689..43060d0818 100644 --- a/cpp/include/raft/linalg/subtract.cuh +++ b/cpp/include/raft/linalg/subtract.cuh @@ -38,8 +38,8 @@ namespace linalg { * @param stream cuda stream where to launch work */ template -void subtractScalar(OutT *out, const InT *in, InT scalar, IdxType len, - cudaStream_t stream) { +void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) +{ auto op = [scalar] __device__(InT in) { return OutT(in - scalar); }; unaryOp(out, in, len, op, stream); } @@ -58,24 +58,25 @@ void subtractScalar(OutT *out, const InT *in, InT scalar, IdxType len, * @param stream cuda stream where to launch work */ template -void subtract(OutT *out, const InT *in1, const InT *in2, IdxType len, - cudaStream_t stream) { +void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) +{ auto op = [] __device__(InT a, InT b) { return OutT(a - b); }; binaryOp(out, in1, in2, len, op, stream); } template -__global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, - IdxType len) { - //TODO: kernel do not use shared memory in current implementation +__global__ void subtract_dev_scalar_kernel(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len) +{ + // TODO: kernel do not use shared memory in current implementation int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x; - if (i < len) { - outDev[i] = inDev[i] - *singleScalarDev; - } + if (i < len) { outDev[i] = inDev[i] - *singleScalarDev; } } -/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i] +/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and + * write result to outDev[i] * @tparam math_t data-type upon which the math operation will be performed * @tparam IdxType Integer type used to for addressing * @param outDev the output buffer @@ -86,9 +87,12 @@ __global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev, * @remark block size has not been tuned */ template -void subtractDevScalar(math_t *outDev, const math_t *inDev, - const math_t *singleScalarDev, IdxType len, - cudaStream_t stream) { +void subtractDevScalar(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len, + cudaStream_t stream) +{ // Just for the note - there is no way to express such operation with cuBLAS in effective way // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda const IdxType nblks = raft::ceildiv(len, (IdxType)TPB); diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh index 2315920689..e14a5b6a50 100644 --- a/cpp/include/raft/linalg/svd.cuh +++ b/cpp/include/raft/linalg/svd.cuh @@ -51,12 +51,20 @@ namespace linalg { // TODO: couldn't template this function due to cusolverDnSgesvd and // cusolverSnSgesvd. Check if there is any other way. template -void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, - T *sing_vals, T *left_sing_vecs, T *right_sing_vecs, - bool trans_right, bool gen_left_vec, bool gen_right_vec, - cudaStream_t stream) { +void svdQR(const raft::handle_t& handle, + T* in, + int n_rows, + int n_cols, + T* sing_vals, + T* left_sing_vecs, + T* right_sing_vecs, + bool trans_right, + bool gen_left_vec, + bool gen_right_vec, + cudaStream_t stream) +{ cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - cublasHandle_t cublasH = handle.get_cublas_handle(); + cublasHandle_t cublasH = handle.get_cublas_handle(); #if CUDART_VERSION >= 10010 && CUDART_VERSION < 11000 // 46340: sqrt of max int value @@ -71,14 +79,13 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, const int n = n_cols; rmm::device_scalar devInfo(stream); - T *d_rwork = nullptr; + T* d_rwork = nullptr; int lwork = 0; - CUSOLVER_CHECK( - cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); + CUSOLVER_CHECK(cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); rmm::device_uvector d_work(lwork, stream); - char jobu = 'S'; + char jobu = 'S'; char jobvt = 'A'; if (!gen_left_vec) { @@ -91,9 +98,23 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, strcpy(&jobvt, &new_vt); } - CUSOLVER_CHECK(cusolverDngesvd( - cusolverH, jobu, jobvt, m, n, in, m, sing_vals, left_sing_vecs, m, - right_sing_vecs, n, d_work.data(), lwork, d_rwork, devInfo.data(), stream)); + CUSOLVER_CHECK(cusolverDngesvd(cusolverH, + jobu, + jobvt, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + d_work.data(), + lwork, + d_rwork, + devInfo.data(), + stream)); // Transpose the right singular vector back if (trans_right) raft::linalg::transpose(right_sing_vecs, n_cols, stream); @@ -109,18 +130,36 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, } template -void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, - T *U, T *V, bool gen_left_vec, cudaStream_t stream) { +void svdEig(const raft::handle_t& handle, + T* in, + int n_rows, + int n_cols, + T* S, + T* U, + T* V, + bool gen_left_vec, + cudaStream_t stream) +{ cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); - cublasHandle_t cublasH = handle.get_cublas_handle(); + cublasHandle_t cublasH = handle.get_cublas_handle(); int len = n_cols * n_cols; rmm::device_uvector in_cross_mult(len, stream); T alpha = T(1); - T beta = T(0); - raft::linalg::gemm(handle, in, n_rows, n_cols, in, in_cross_mult.data(), - n_cols, n_cols, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, + T beta = T(0); + raft::linalg::gemm(handle, + in, + n_rows, + n_cols, + in, + in_cross_mult.data(), + n_cols, + n_cols, + CUBLAS_OP_T, + CUBLAS_OP_N, + alpha, + beta, stream); eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream); @@ -131,10 +170,20 @@ void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, raft::matrix::seqRoot(S, S, alpha, n_cols, stream, true); if (gen_left_vec) { - raft::linalg::gemm(handle, in, n_rows, n_cols, V, U, n_rows, n_cols, - CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); - raft::matrix::matrixVectorBinaryDivSkipZero(U, S, n_rows, n_cols, false, - true, stream); + raft::linalg::gemm(handle, + in, + n_rows, + n_cols, + V, + U, + n_rows, + n_cols, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); + raft::matrix::matrixVectorBinaryDivSkipZero(U, S, n_rows, n_cols, false, true, stream); } } @@ -156,10 +205,19 @@ void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, * @param stream cuda stream */ template -void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, - math_t *sing_vals, math_t *left_sing_vecs, - math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec, - math_t tol, int max_sweeps, cudaStream_t stream) { +void svdJacobi(const raft::handle_t& handle, + math_t* in, + int n_rows, + int n_cols, + math_t* sing_vals, + math_t* left_sing_vecs, + math_t* right_sing_vecs, + bool gen_left_vec, + bool gen_right_vec, + math_t tol, + int max_sweeps, + cudaStream_t stream) +{ cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); gesvdjInfo_t gesvdj_params = NULL; @@ -174,18 +232,42 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, rmm::device_scalar devInfo(stream); int lwork = 0; - int econ = 1; - - CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, - left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params)); + int econ = 1; + + CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + econ, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + &lwork, + gesvdj_params)); rmm::device_uvector d_work(lwork, stream); - CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj( - cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, - left_sing_vecs, m, right_sing_vecs, n, d_work.data(), lwork, devInfo.data(), - gesvdj_params, stream)); + CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj(cusolverH, + CUSOLVER_EIG_MODE_VECTOR, + econ, + m, + n, + in, + m, + sing_vals, + left_sing_vecs, + m, + right_sing_vecs, + n, + d_work.data(), + lwork, + devInfo.data(), + gesvdj_params, + stream)); CUSOLVER_CHECK(cusolverDnDestroyGesvdjInfo(gesvdj_params)); } @@ -204,16 +286,34 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, * @param stream cuda stream */ template -void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S, - math_t *V, math_t *out, int n_rows, int n_cols, int k, - cudaStream_t stream) { +void svdReconstruction(const raft::handle_t& handle, + math_t* U, + math_t* S, + math_t* V, + math_t* out, + int n_rows, + int n_cols, + int k, + cudaStream_t stream) +{ const math_t alpha = 1.0, beta = 0.0; rmm::device_uvector SVT(k * n_cols, stream); - raft::linalg::gemm(handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, - CUBLAS_OP_T, alpha, beta, stream); - raft::linalg::gemm(handle, U, n_rows, k, SVT.data(), out, n_rows, n_cols, - CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream); + raft::linalg::gemm( + handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream); + raft::linalg::gemm(handle, + U, + n_rows, + k, + SVT.data(), + out, + n_rows, + n_cols, + CUBLAS_OP_N, + CUBLAS_OP_N, + alpha, + beta, + stream); } /** @@ -231,9 +331,17 @@ void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S, * @param stream cuda stream */ template -bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, - math_t *S_vec, math_t *V, int n_rows, int n_cols, - int k, math_t tol, cudaStream_t stream) { +bool evaluateSVDByL2Norm(const raft::handle_t& handle, + math_t* A_d, + math_t* U, + math_t* S_vec, + math_t* V, + int n_rows, + int n_cols, + int k, + math_t tol, + cudaStream_t stream) +{ cublasHandle_t cublasH = handle.get_cublas_handle(); int m = n_rows, n = n_cols; @@ -257,16 +365,25 @@ bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, // calculate percent error const math_t alpha = 1.0, beta = -1.0; rmm::device_uvector A_minus_P(m * n, stream); - CUDA_CHECK( - cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); - - CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, - &alpha, A_d, m, &beta, P_d.data(), m, - A_minus_P.data(), m, stream)); - - math_t norm_A_minus_P = - raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream); - math_t percent_error = 100.0 * norm_A_minus_P / normA; + CUDA_CHECK(cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); + + CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, + CUBLAS_OP_N, + CUBLAS_OP_N, + m, + n, + &alpha, + A_d, + m, + &beta, + P_d.data(), + m, + A_minus_P.data(), + m, + stream)); + + math_t norm_A_minus_P = raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream); + math_t percent_error = 100.0 * norm_A_minus_P / normA; return (percent_error / 100.0 < tol); } diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/transpose.h index db1cabd694..e84ddd1166 100644 --- a/cpp/include/raft/linalg/transpose.h +++ b/cpp/include/raft/linalg/transpose.h @@ -33,18 +33,34 @@ namespace linalg { * @param stream: cuda stream */ template -void transpose(const raft::handle_t &handle, math_t *in, math_t *out, - int n_rows, int n_cols, cudaStream_t stream) { +void transpose(const raft::handle_t& handle, + math_t* in, + math_t* out, + int n_rows, + int n_cols, + cudaStream_t stream) +{ cublasHandle_t cublas_h = handle.get_cublas_handle(); int out_n_rows = n_cols; int out_n_cols = n_rows; const math_t alpha = 1.0; - const math_t beta = 0.0; - CUBLAS_CHECK(raft::linalg::cublasgeam( - cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, out_n_rows, out_n_cols, &alpha, in, - n_rows, &beta, out, out_n_rows, out, out_n_rows, stream)); + const math_t beta = 0.0; + CUBLAS_CHECK(raft::linalg::cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_n_rows, + out_n_cols, + &alpha, + in, + n_rows, + &beta, + out, + out_n_rows, + out, + out_n_rows, + stream)); } /** @@ -54,24 +70,24 @@ void transpose(const raft::handle_t &handle, math_t *in, math_t *out, * @param stream: cuda stream */ template -void transpose(math_t *inout, int n, cudaStream_t stream) { - auto m = n; - auto size = n * n; - auto d_inout = inout; +void transpose(math_t* inout, int n, cudaStream_t stream) +{ + auto m = n; + auto size = n * n; + auto d_inout = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(stream), counting, counting + size, - [=] __device__(int idx) { - int s_row = idx % m; - int s_col = idx / m; - int d_row = s_col; - int d_col = s_row; - if (s_row < s_col) { - auto temp = d_inout[d_col * m + d_row]; - d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row]; - d_inout[s_col * m + s_row] = temp; - } - }); + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(int idx) { + int s_row = idx % m; + int s_col = idx / m; + int d_row = s_col; + int d_col = s_row; + if (s_row < s_col) { + auto temp = d_inout[d_col * m + d_row]; + d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row]; + d_inout[s_col * m + s_row] = temp; + } + }); } }; // end namespace linalg diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh index 46b4d296cb..198b9b2b10 100644 --- a/cpp/include/raft/linalg/unary_op.cuh +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -23,10 +23,9 @@ namespace raft { namespace linalg { -template -__global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len, - Lambda op) { +template +__global__ void unaryOpKernel(OutType* out, const InType* in, IdxType len, Lambda op) +{ typedef TxN_t InVecType; typedef TxN_t OutVecType; InVecType a; @@ -42,12 +41,10 @@ __global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len, b.store(out, idx); } -template -void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op, - cudaStream_t stream) { - const IdxType nblks = - raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); +template +void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); unaryOpKernel <<>>(out, in, len, op); CUDA_CHECK(cudaPeekAtLastError()); @@ -68,47 +65,38 @@ void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op, * @note Lambda must be a functor with the following signature: * `OutType func(const InType& val);` */ -template -void unaryOp(OutType *out, const InType *in, IdxType len, Lambda op, - cudaStream_t stream) { - if (len <= 0) return; //silently skip in case of 0 length input - constexpr auto maxSize = - sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); - size_t bytes = len * maxSize; - uint64_t inAddr = uint64_t(in); - uint64_t outAddr = uint64_t(out); - if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && - outAddr % 16 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && - outAddr % 8 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && - outAddr % 4 == 0) { - unaryOpImpl( - out, in, len, op, stream); - } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && - outAddr % 2 == 0) { - unaryOpImpl( - out, in, len, op, stream); +template +void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +{ + if (len <= 0) return; // silently skip in case of 0 length input + constexpr auto maxSize = sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t inAddr = uint64_t(in); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && outAddr % 16 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && outAddr % 8 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && outAddr % 4 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) { + unaryOpImpl(out, in, len, op, stream); } else if (1 / maxSize) { - unaryOpImpl( - out, in, len, op, stream); + unaryOpImpl(out, in, len, op, stream); } else { - unaryOpImpl(out, in, len, op, - stream); + unaryOpImpl(out, in, len, op, stream); } } template -__global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) { +__global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op) +{ IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - if (idx < len) { - op(out + idx, idx); - } + if (idx < len) { op(out + idx, idx); } } /** @@ -128,14 +116,12 @@ __global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) { * where outLocationOffset will be out + idx. * @param[in] stream cuda stream where to launch work */ -template -void writeOnlyUnaryOp(OutType *out, IdxType len, Lambda op, - cudaStream_t stream) { +template +void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) +{ if (len <= 0) return; // silently skip in case of 0 length input auto nblks = raft::ceildiv(len, TPB); - writeOnlyUnaryOpKernel - <<>>(out, len, op); + writeOnlyUnaryOpKernel<<>>(out, len, op); CUDA_CHECK(cudaGetLastError()); } diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh index f79cb397b7..4b56f3986f 100644 --- a/cpp/include/raft/matrix/detail/math.cuh +++ b/cpp/include/raft/matrix/detail/math.cuh @@ -25,30 +25,29 @@ namespace detail { // Computes the argmax(d_in) column-wise in a DxN matrix template -__global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) { +__global__ void argmaxKernel(const T* d_in, int D, int N, T* argmax) +{ typedef cub::BlockReduce, TPB> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; // compute maxIndex=argMax index for column - using KVP = cub::KeyValuePair; + using KVP = cub::KeyValuePair; int rowStart = blockIdx.x * D; KVP thread_data(-1, -raft::myInf()); for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; + int idx = rowStart + i; thread_data = cub::ArgMax()(thread_data, KVP(i, d_in[idx])); } auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax()); - if (threadIdx.x == 0) { - argmax[blockIdx.x] = maxKV.key; - } + if (threadIdx.x == 0) { argmax[blockIdx.x] = maxKV.key; } } template -void argmax(const math_t *in, int n_rows, int n_cols, math_t *out, - cudaStream_t stream) { +void argmax(const math_t* in, int n_rows, int n_cols, math_t* out, cudaStream_t stream) +{ int D = n_rows; int N = n_cols; if (D <= 32) { @@ -67,39 +66,39 @@ void argmax(const math_t *in, int n_rows, int n_cols, math_t *out, // Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by // flipping the sign if the |max| value for each column is negative. template -__global__ void signFlipKernel(T *d_in, int D, int N) { +__global__ void signFlipKernel(T* d_in, int D, int N) +{ typedef cub::BlockReduce, TPB> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; // compute maxIndex=argMax (with abs()) index for column - using KVP = cub::KeyValuePair; + using KVP = cub::KeyValuePair; int rowStart = blockIdx.x * D; KVP thread_data(0, 0); for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; + int idx = rowStart + i; thread_data = cub::ArgMax()(thread_data, KVP(idx, abs(d_in[idx]))); } auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax()); // flip column sign if d_in[maxIndex] < 0 __shared__ bool need_sign_flip; - if (threadIdx.x == 0) { - need_sign_flip = d_in[maxKV.key] < T(0); - } + if (threadIdx.x == 0) { need_sign_flip = d_in[maxKV.key] < T(0); } __syncthreads(); if (need_sign_flip) { for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; + int idx = rowStart + i; d_in[idx] = -d_in[idx]; } } } template -void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) { - int D = n_rows; - int N = n_cols; +void signFlip(math_t* inout, int n_rows, int n_cols, cudaStream_t stream) +{ + int D = n_rows; + int N = n_cols; auto data = inout; if (D <= 32) { signFlipKernel<<>>(data, D, N); diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index 8293d01bdb..709570ae56 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -28,29 +28,32 @@ namespace matrix { namespace detail { template -void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, - const idx_array_t *indices, idx_t n_rows_indices, - cudaStream_t stream, bool rowMajor = false) { +void copyRows(const m_t* in, + idx_t n_rows, + idx_t n_cols, + m_t* out, + const idx_array_t* indices, + idx_t n_rows_indices, + cudaStream_t stream, + bool rowMajor = false) +{ if (rowMajor) { const idx_t TPB = 256; - cache:: - get_vecs<<>>( - in, n_cols, indices, n_rows_indices, out); + cache::get_vecs<<>>( + in, n_cols, indices, n_rows_indices, out); CUDA_CHECK(cudaPeekAtLastError()); return; } - idx_t size = n_rows_indices * n_cols; + idx_t size = n_rows_indices * n_cols; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(stream), counting, counting + size, - [=] __device__(idx_t idx) { - idx_t row = idx % n_rows_indices; - idx_t col = idx / n_rows_indices; + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) { + idx_t row = idx % n_rows_indices; + idx_t col = idx / n_rows_indices; - out[col * n_rows_indices + row] = - in[col * n_rows + indices[row]]; - }); + out[col * n_rows_indices + row] = in[col * n_rows + indices[row]]; + }); } /** @@ -65,8 +68,9 @@ void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, * (1-based) */ template -__global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1, - idx_t y1, idx_t x2, idx_t y2) { +__global__ void slice( + m_t* src_d, idx_t m, idx_t n, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2) +{ idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t dm = x2 - x1, dn = y2 - y1; if (idx < dm * dn) { @@ -77,8 +81,16 @@ __global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1, } template -void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1, - idx_t y1, idx_t x2, idx_t y2, cudaStream_t stream) { +void sliceMatrix(m_t* in, + idx_t n_rows, + idx_t n_cols, + m_t* out, + idx_t x1, + idx_t y1, + idx_t x2, + idx_t y2, + cudaStream_t stream) +{ // Slicing dim3 block(64); dim3 grid(((x2 - x1) * (y2 - y1) + block.x - 1) / block.x); @@ -94,21 +106,19 @@ void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1, * @param k: min(n_rows, n_cols) */ template -__global__ void getUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, - idx_t n_cols, idx_t k) { +__global__ void getUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, idx_t k) +{ idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t m = n_rows, n = n_cols; if (idx < m * n) { idx_t i = idx % m, j = idx / m; - if (i < k && j < k && j >= i) { - dst[i + j * k] = src[idx]; - } + if (i < k && j < k && j >= i) { dst[i + j * k] = src[idx]; } } } template -void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols, - cudaStream_t stream) { +void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ idx_t m = n_rows, n = n_cols; idx_t k = min(m, n); dim3 block(64); @@ -125,23 +135,21 @@ void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols, * @param k: dimensionality */ template -__global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, idx_t m, - idx_t n, idx_t k) { +__global__ void copyVectorToMatrixDiagonal(m_t* vec, m_t* matrix, idx_t m, idx_t n, idx_t k) +{ idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < k) { - matrix[idx + idx * m] = vec[idx]; - } + if (idx < k) { matrix[idx + idx * m] = vec[idx]; } } template -void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols, - cudaStream_t stream) { +void initializeDiagonalMatrix( + m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ idx_t k = min(n_rows, n_cols); dim3 block(64); dim3 grid((k + block.x - 1) / block.x); - copyVectorToMatrixDiagonal<<>>(vec, matrix, n_rows, - n_cols, k); + copyVectorToMatrixDiagonal<<>>(vec, matrix, n_rows, n_cols, k); } /** @@ -151,15 +159,15 @@ void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols, * @param len: size of one side of the matrix */ template -__global__ void matrixDiagonalInverse(m_t *in, idx_t len) { +__global__ void matrixDiagonalInverse(m_t* in, idx_t len) +{ idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < len) { - in[idx + idx * len] = 1.0 / in[idx + idx * len]; - } + if (idx < len) { in[idx + idx * len] = 1.0 / in[idx + idx * len]; } } template -void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) { +void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream) +{ dim3 block(64); dim3 grid((len + block.x - 1) / block.x); matrixDiagonalInverse<<>>(in, len); diff --git a/cpp/include/raft/matrix/math.hpp b/cpp/include/raft/matrix/math.hpp index e67440019f..df6eb6f489 100644 --- a/cpp/include/raft/matrix/math.hpp +++ b/cpp/include/raft/matrix/math.hpp @@ -43,14 +43,18 @@ namespace matrix { * @param stream cuda stream */ template -void power(math_t *in, math_t *out, math_t scalar, int len, - cudaStream_t stream) { - auto d_src = in; +void power(math_t* in, math_t* out, math_t scalar, int len, cudaStream_t stream) +{ + auto d_src = in; auto d_dest = out; raft::linalg::binaryOp( - d_dest, d_src, d_src, len, - [=] __device__(math_t a, math_t b) { return scalar * a * b; }, stream); + d_dest, + d_src, + d_src, + len, + [=] __device__(math_t a, math_t b) { return scalar * a * b; }, + stream); } /** @@ -61,7 +65,8 @@ void power(math_t *in, math_t *out, math_t scalar, int len, * @param stream cuda stream */ template -void power(math_t *inout, math_t scalar, int len, cudaStream_t stream) { +void power(math_t* inout, math_t scalar, int len, cudaStream_t stream) +{ power(inout, inout, scalar, len, stream); } @@ -72,7 +77,8 @@ void power(math_t *inout, math_t scalar, int len, cudaStream_t stream) { * @param stream cuda stream */ template -void power(math_t *inout, int len, cudaStream_t stream) { +void power(math_t* inout, int len, cudaStream_t stream) +{ math_t scalar = 1.0; power(inout, scalar, len, stream); } @@ -86,7 +92,8 @@ void power(math_t *inout, int len, cudaStream_t stream) { * @{ */ template -void power(math_t *in, math_t *out, int len, cudaStream_t stream) { +void power(math_t* in, math_t* out, int len, cudaStream_t stream) +{ math_t scalar = 1.0; power(in, out, scalar, len, stream); } @@ -103,13 +110,20 @@ void power(math_t *in, math_t *out, int len, cudaStream_t stream) { * @param set_neg_zero whether to set negative numbers to zero */ template -void seqRoot(math_t *in, math_t *out, math_t scalar, IdxType len, - cudaStream_t stream, bool set_neg_zero = false) { - auto d_src = in; +void seqRoot(math_t* in, + math_t* out, + math_t scalar, + IdxType len, + cudaStream_t stream, + bool set_neg_zero = false) +{ + auto d_src = in; auto d_dest = out; raft::linalg::unaryOp( - d_dest, d_src, len, + d_dest, + d_src, + len, [=] __device__(math_t a) { if (set_neg_zero) { if (a < math_t(0)) { @@ -135,8 +149,9 @@ void seqRoot(math_t *in, math_t *out, math_t scalar, IdxType len, * @param set_neg_zero whether to set negative numbers to zero */ template -void seqRoot(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream, - bool set_neg_zero = false) { +void seqRoot( + math_t* inout, math_t scalar, IdxType len, cudaStream_t stream, bool set_neg_zero = false) +{ seqRoot(inout, inout, scalar, len, stream, set_neg_zero); } @@ -150,22 +165,27 @@ void seqRoot(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream, * @param stream cuda stream */ template -void seqRoot(math_t *in, math_t *out, IdxType len, cudaStream_t stream) { +void seqRoot(math_t* in, math_t* out, IdxType len, cudaStream_t stream) +{ math_t scalar = 1.0; seqRoot(in, out, scalar, len, stream); } template -void seqRoot(math_t *inout, IdxType len, cudaStream_t stream) { +void seqRoot(math_t* inout, IdxType len, cudaStream_t stream) +{ math_t scalar = 1.0; seqRoot(inout, inout, scalar, len, stream); } template -void setSmallValuesZero(math_t *out, const math_t *in, IdxType len, - cudaStream_t stream, math_t thres = 1e-15) { +void setSmallValuesZero( + math_t* out, const math_t* in, IdxType len, cudaStream_t stream, math_t thres = 1e-15) +{ raft::linalg::unaryOp( - out, in, len, + out, + in, + len, [=] __device__(math_t a) { if (a <= thres && -a <= thres) { return math_t(0); @@ -186,8 +206,8 @@ void setSmallValuesZero(math_t *out, const math_t *in, IdxType len, * @param thres: threshold */ template -void setSmallValuesZero(math_t *inout, IdxType len, cudaStream_t stream, - math_t thres = 1e-15) { +void setSmallValuesZero(math_t* inout, IdxType len, cudaStream_t stream, math_t thres = 1e-15) +{ setSmallValuesZero(inout, inout, len, stream, thres); } @@ -205,14 +225,21 @@ void setSmallValuesZero(math_t *inout, IdxType len, cudaStream_t stream, * @{ */ template -void reciprocal(math_t *in, math_t *out, math_t scalar, int len, - cudaStream_t stream, bool setzero = false, - math_t thres = 1e-15) { - auto d_src = in; +void reciprocal(math_t* in, + math_t* out, + math_t scalar, + int len, + cudaStream_t stream, + bool setzero = false, + math_t thres = 1e-15) +{ + auto d_src = in; auto d_dest = out; raft::linalg::unaryOp( - d_dest, d_src, len, + d_dest, + d_src, + len, [=] __device__(math_t a) { if (setzero) { if (abs(a) <= thres) { @@ -239,8 +266,13 @@ void reciprocal(math_t *in, math_t *out, math_t scalar, int len, * @param thres: Threshold to avoid dividing by zero (|value| < thres -> result = 0) */ template -void reciprocal(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream, - bool setzero = false, math_t thres = 1e-15) { +void reciprocal(math_t* inout, + math_t scalar, + IdxType len, + cudaStream_t stream, + bool setzero = false, + math_t thres = 1e-15) +{ reciprocal(inout, inout, scalar, len, stream, setzero, thres); } @@ -253,7 +285,8 @@ void reciprocal(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream, * @param stream cuda stream */ template -void reciprocal(math_t *inout, IdxType len, cudaStream_t stream) { +void reciprocal(math_t* inout, IdxType len, cudaStream_t stream) +{ math_t scalar = 1.0; reciprocal(inout, scalar, len, stream); } @@ -268,14 +301,15 @@ void reciprocal(math_t *inout, IdxType len, cudaStream_t stream) { * @param stream cuda stream */ template -void reciprocal(math_t *in, math_t *out, IdxType len, cudaStream_t stream) { +void reciprocal(math_t* in, math_t* out, IdxType len, cudaStream_t stream) +{ math_t scalar = 1.0; reciprocal(in, out, scalar, len, stream); } template -void setValue(math_t *out, const math_t *in, math_t scalar, int len, - cudaStream_t stream = 0) { +void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_t stream = 0) +{ raft::linalg::unaryOp( out, in, len, [scalar] __device__(math_t in) { return scalar; }, stream); } @@ -290,18 +324,18 @@ void setValue(math_t *out, const math_t *in, math_t scalar, int len, * @param stream cuda stream */ template -void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len, - cudaStream_t stream) { - auto d_src = src; +void ratio( + const raft::handle_t& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream) +{ + auto d_src = src; auto d_dest = dest; rmm::device_scalar d_sum(stream); - auto *d_sum_ptr = d_sum.data(); - auto no_op = [] __device__(math_t in) { return in; }; + auto* d_sum_ptr = d_sum.data(); + auto no_op = [] __device__(math_t in) { return in; }; raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src); raft::linalg::unaryOp( - d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); }, - stream); + d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); }, stream); } /** @} */ @@ -315,8 +349,8 @@ void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len, * @param stream: cuda stream */ template -void argmax(const math_t *in, int n_rows, int n_cols, math_t *out, - cudaStream_t stream) { +void argmax(const math_t* in, int n_rows, int n_cols, math_t* out, cudaStream_t stream) +{ detail::argmax(in, n_rows, n_cols, out, stream); } @@ -329,25 +363,49 @@ void argmax(const math_t *in, int n_rows, int n_cols, math_t *out, * @param stream cuda stream */ template -void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) { +void signFlip(math_t* inout, int n_rows, int n_cols, cudaStream_t stream) +{ detail::signFlip(inout, n_rows, n_cols, stream); } template -void matrixVectorBinaryMult(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void matrixVectorBinaryMult(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a * b; }, stream); + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a * b; }, + stream); } template -void matrixVectorBinaryMultSkipZero(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, - bool bcastAlongRows, cudaStream_t stream) { +void matrixVectorBinaryMultSkipZero(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, [] __device__(Type a, Type b) { if (b == Type(0)) return a; @@ -358,22 +416,45 @@ void matrixVectorBinaryMultSkipZero(Type *data, const Type *vec, IdxType n_row, } template -void matrixVectorBinaryDiv(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void matrixVectorBinaryDiv(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a / b; }, stream); + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a / b; }, + stream); } template -void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, - bool bcastAlongRows, cudaStream_t stream, - bool return_zero = false) { +void matrixVectorBinaryDivSkipZero(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream, + bool return_zero = false) +{ if (return_zero) { raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, [] __device__(Type a, Type b) { if (raft::myAbs(b) < Type(1e-10)) return Type(0); @@ -383,7 +464,13 @@ void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row, stream); } else { raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, [] __device__(Type a, Type b) { if (raft::myAbs(b) < Type(1e-10)) return a; @@ -395,21 +482,45 @@ void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row, } template -void matrixVectorBinaryAdd(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void matrixVectorBinaryAdd(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a + b; }, stream); + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a + b; }, + stream); } template -void matrixVectorBinarySub(Type *data, const Type *vec, IdxType n_row, - IdxType n_col, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void matrixVectorBinarySub(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a - b; }, stream); + data, + data, + vec, + n_col, + n_row, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a - b; }, + stream); } }; // end namespace matrix diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp index 8dd9fbf487..c4cd30b7bc 100644 --- a/cpp/include/raft/matrix/matrix.hpp +++ b/cpp/include/raft/matrix/matrix.hpp @@ -47,11 +47,16 @@ using namespace std; * @param rowMajor whether the matrix has row major layout */ template -void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, - const idx_array_t *indices, idx_t n_rows_indices, - cudaStream_t stream, bool rowMajor = false) { - detail::copyRows(in, n_rows, n_cols, out, indices, n_rows_indices, stream, - rowMajor); +void copyRows(const m_t* in, + idx_t n_rows, + idx_t n_cols, + m_t* out, + const idx_array_t* indices, + idx_t n_rows_indices, + cudaStream_t stream, + bool rowMajor = false) +{ + detail::copyRows(in, n_rows, n_cols, out, indices, n_rows_indices, stream, rowMajor); } /** @@ -63,8 +68,8 @@ void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, * @param stream: cuda stream */ template -void copy(const m_t *in, m_t *out, idx_t n_rows, idx_t n_cols, - cudaStream_t stream) { +void copy(const m_t* in, m_t* out, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ raft::copy_async(out, in, n_rows * n_cols, stream); } @@ -79,21 +84,21 @@ void copy(const m_t *in, m_t *out, idx_t n_rows, idx_t n_cols, * @param stream: cuda stream */ template -void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows, - idx_t out_n_cols, cudaStream_t stream) { - auto m = out_n_rows; - auto k = in_n_rows; - idx_t size = out_n_rows * out_n_cols; - auto d_q = in; +void truncZeroOrigin( + m_t* in, idx_t in_n_rows, m_t* out, idx_t out_n_rows, idx_t out_n_cols, cudaStream_t stream) +{ + auto m = out_n_rows; + auto k = in_n_rows; + idx_t size = out_n_rows * out_n_cols; + auto d_q = in; auto d_q_trunc = out; - auto counting = thrust::make_counting_iterator(0); + auto counting = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(stream), counting, counting + size, - [=] __device__(idx_t idx) { - idx_t row = idx % m; - idx_t col = idx / m; - d_q_trunc[col * m + row] = d_q[col * k + row]; - }); + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) { + idx_t row = idx % m; + idx_t col = idx / m; + d_q_trunc[col * m + row] = d_q[col * k + row]; + }); } /** @@ -105,24 +110,25 @@ void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows, * @param stream: cuda stream */ template -void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - auto n = n_cols; - auto m = n_rows; - idx_t size = n_rows * n_cols; - auto d_q = inout; +void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ + auto n = n_cols; + auto m = n_rows; + idx_t size = n_rows * n_cols; + auto d_q = inout; auto d_q_reversed = inout; - auto counting = thrust::make_counting_iterator(0); + auto counting = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(stream), counting, counting + (size / 2), - [=] __device__(idx_t idx) { - idx_t dest_row = idx % m; - idx_t dest_col = idx / m; - idx_t src_row = dest_row; - idx_t src_col = (n - dest_col) - 1; - m_t temp = (m_t)d_q_reversed[idx]; - d_q_reversed[idx] = d_q[src_col * m + src_row]; - d_q[src_col * m + src_row] = temp; - }); + thrust::for_each( + rmm::exec_policy(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { + idx_t dest_row = idx % m; + idx_t dest_col = idx / m; + idx_t src_row = dest_row; + idx_t src_col = (n - dest_col) - 1; + m_t temp = (m_t)d_q_reversed[idx]; + d_q_reversed[idx] = d_q[src_col * m + src_row]; + d_q[src_col * m + src_row] = temp; + }); } /** @@ -134,25 +140,26 @@ void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { * @param stream: cuda stream */ template -void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - auto m = n_rows; - idx_t size = n_rows * n_cols; - auto d_q = inout; +void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ + auto m = n_rows; + idx_t size = n_rows * n_cols; + auto d_q = inout; auto d_q_reversed = inout; - auto counting = thrust::make_counting_iterator(0); + auto counting = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(stream), counting, counting + (size / 2), - [=] __device__(idx_t idx) { - idx_t dest_row = idx % m; - idx_t dest_col = idx / m; - idx_t src_row = (m - dest_row) - 1; - ; - idx_t src_col = dest_col; + thrust::for_each( + rmm::exec_policy(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { + idx_t dest_row = idx % m; + idx_t dest_col = idx / m; + idx_t src_row = (m - dest_row) - 1; + ; + idx_t src_col = dest_col; - m_t temp = (m_t)d_q_reversed[idx]; - d_q_reversed[idx] = d_q[src_col * m + src_row]; - d_q[src_col * m + src_row] = temp; - }); + m_t temp = (m_t)d_q_reversed[idx]; + d_q_reversed[idx] = d_q[src_col * m + src_row]; + d_q[src_col * m + src_row] = temp; + }); } /** @@ -164,16 +171,19 @@ void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { * @param v_separator: vertical separator character */ template -void print(const m_t *in, idx_t n_rows, idx_t n_cols, char h_separator = ' ', - char v_separator = '\n', - cudaStream_t stream = rmm::cuda_stream_default) { +void print(const m_t* in, + idx_t n_rows, + idx_t n_cols, + char h_separator = ' ', + char v_separator = '\n', + cudaStream_t stream = rmm::cuda_stream_default) +{ std::vector h_matrix = std::vector(n_cols * n_rows); raft::update_host(h_matrix.data(), in, n_cols * n_rows, stream); for (idx_t i = 0; i < n_rows; i++) { for (idx_t j = 0; j < n_cols; j++) { - printf("%1.4f%c", h_matrix[j * n_rows + i], - j < n_cols - 1 ? h_separator : v_separator); + printf("%1.4f%c", h_matrix[j * n_rows + i], j < n_cols - 1 ? h_separator : v_separator); } } } @@ -185,7 +195,8 @@ void print(const m_t *in, idx_t n_rows, idx_t n_cols, char h_separator = ' ', * @param n_cols: number of columns of input matrix */ template -void printHost(const m_t *in, idx_t n_rows, idx_t n_cols) { +void printHost(const m_t* in, idx_t n_rows, idx_t n_cols) +{ for (idx_t i = 0; i < n_rows; i++) { for (idx_t j = 0; j < n_cols; j++) { printf("%1.4f ", in[j * n_rows + i]); @@ -208,8 +219,16 @@ void printHost(const m_t *in, idx_t n_rows, idx_t n_cols) { * @param stream: cuda stream */ template -void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1, - idx_t y1, idx_t x2, idx_t y2, cudaStream_t stream) { +void sliceMatrix(m_t* in, + idx_t n_rows, + idx_t n_cols, + m_t* out, + idx_t x1, + idx_t y1, + idx_t x2, + idx_t y2, + cudaStream_t stream) +{ detail::sliceMatrix(in, n_rows, n_cols, out, x1, y1, x2, y2, stream); } @@ -222,8 +241,8 @@ void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1, * @param stream: cuda stream */ template -void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols, - cudaStream_t stream) { +void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ detail::copyUpperTriangular(src, dst, n_rows, n_cols, stream); } @@ -236,8 +255,9 @@ void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols, * @param stream: cuda stream */ template -void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols, - cudaStream_t stream) { +void initializeDiagonalMatrix( + m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ detail::initializeDiagonalMatrix(vec, matrix, n_rows, n_cols, stream); } @@ -248,7 +268,8 @@ void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols, * @param stream: cuda stream */ template -void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) { +void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream) +{ detail::getDiagonalInverseMatrix(in, len, stream); } @@ -260,12 +281,11 @@ void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) { * @param stream: cuda stream */ template -m_t getL2Norm(const raft::handle_t &handle, m_t *in, idx_t size, - cudaStream_t stream) { +m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream) +{ cublasHandle_t cublasH = handle.get_cublas_handle(); - m_t normval = 0; - CUBLAS_CHECK( - raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream)); + m_t normval = 0; + CUBLAS_CHECK(raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream)); return normval; } diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp index 4a2362bf97..38ef59aadf 100644 --- a/cpp/include/raft/mr/buffer_base.hpp +++ b/cpp/include/raft/mr/buffer_base.hpp @@ -38,11 +38,11 @@ namespace mr { template class buffer_base { public: - using size_type = std::size_t; - using value_type = T; - using iterator = value_type*; - using const_iterator = const value_type*; - using reference = T&; + using size_type = std::size_t; + using value_type = T; + using iterator = value_type*; + using const_iterator = const value_type*; + using reference = T&; using const_reference = const T&; buffer_base() = delete; @@ -58,16 +58,12 @@ class buffer_base { * @param[in] stream cuda stream where this allocation operations are async * @param[in] n size of the buffer (in number of elements) */ - buffer_base(std::shared_ptr allocator, cudaStream_t stream, - size_type n = 0) - : data_(nullptr), - size_(n), - capacity_(n), - stream_(stream), - allocator_(std::move(allocator)) { + buffer_base(std::shared_ptr allocator, cudaStream_t stream, size_type n = 0) + : data_(nullptr), size_(n), capacity_(n), stream_(stream), allocator_(std::move(allocator)) + { if (capacity_ > 0) { - data_ = static_cast( - allocator_->allocate(capacity_ * sizeof(value_type), stream_)); + data_ = + static_cast(allocator_->allocate(capacity_ * sizeof(value_type), stream_)); CUDA_CHECK(cudaStreamSynchronize(stream_)); } } @@ -100,23 +96,23 @@ class buffer_base { * @param[in] new_capacity new capacity (in number of elements) * @{ */ - void reserve(size_type new_capacity) { + void reserve(size_type new_capacity) + { if (new_capacity > capacity_) { - auto* new_data = static_cast( - allocator_->allocate(new_capacity * sizeof(value_type), stream_)); - if (size_ > 0) { - raft::copy(new_data, data_, size_, stream_); - } + auto* new_data = + static_cast(allocator_->allocate(new_capacity * sizeof(value_type), stream_)); + if (size_ > 0) { raft::copy(new_data, data_, size_, stream_); } // Only deallocate if we have allocated a pointer if (nullptr != data_) { allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); } - data_ = new_data; + data_ = new_data; capacity_ = new_capacity; } } - void reserve(size_type new_capacity, cudaStream_t stream) { + void reserve(size_type new_capacity, cudaStream_t stream) + { set_stream(stream); reserve(new_capacity); } @@ -128,12 +124,14 @@ class buffer_base { * @param[in] new_size new buffer size * @{ */ - void resize(const size_type new_size) { + void resize(const size_type new_size) + { reserve(new_size); size_ = new_size; } - void resize(const size_type new_size, cudaStream_t stream) { + void resize(const size_type new_size, cudaStream_t stream) + { set_stream(stream); resize(new_size); } @@ -145,16 +143,18 @@ class buffer_base { * If this method is not explicitly called, it will be during the destructor * @{ */ - void release() { + void release() + { if (nullptr != data_) { allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); } - data_ = nullptr; + data_ = nullptr; capacity_ = 0; - size_ = 0; + size_ = 0; } - void release(cudaStream_t stream) { + void release(cudaStream_t stream) + { set_stream(stream); release(); } @@ -194,7 +194,8 @@ class buffer_base { * @param[in] stream new cuda stream to be set. If it is the same as the * current one, then this method will be a no-op. */ - void set_stream(cudaStream_t stream) { + void set_stream(cudaStream_t stream) + { if (stream_ != stream) { cudaEvent_t event; CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); diff --git a/cpp/include/raft/mr/device/allocator.hpp b/cpp/include/raft/mr/device/allocator.hpp index 3d1ce38c31..8d306a199f 100644 --- a/cpp/include/raft/mr/device/allocator.hpp +++ b/cpp/include/raft/mr/device/allocator.hpp @@ -34,17 +34,20 @@ namespace device { * further to the ones listed in `Allocator`: * - Allocations may be always on the device that was specified on construction. */ -class allocator : public base_allocator {}; +class allocator : public base_allocator { +}; /** Default device allocator based on the one provided by RMM */ class default_allocator : public allocator { public: - void* allocate(std::size_t n, cudaStream_t stream) override { + void* allocate(std::size_t n, cudaStream_t stream) override + { void* ptr = rmm::mr::get_current_device_resource()->allocate(n, stream); return ptr; } - void deallocate(void* p, std::size_t n, cudaStream_t stream) override { + void deallocate(void* p, std::size_t n, cudaStream_t stream) override + { rmm::mr::get_current_device_resource()->deallocate(p, n, stream); } }; // class default_allocator diff --git a/cpp/include/raft/mr/device/buffer.hpp b/cpp/include/raft/mr/device/buffer.hpp index 39b5674ce4..2b9d84368f 100644 --- a/cpp/include/raft/mr/device/buffer.hpp +++ b/cpp/include/raft/mr/device/buffer.hpp @@ -46,11 +46,11 @@ namespace device { template class buffer : public buffer_base { public: - using size_type = typename buffer_base::size_type; - using value_type = typename buffer_base::value_type; - using iterator = typename buffer_base::iterator; - using const_iterator = typename buffer_base::const_iterator; - using reference = typename buffer_base::reference; + using size_type = typename buffer_base::size_type; + using value_type = typename buffer_base::value_type; + using iterator = typename buffer_base::iterator; + using const_iterator = typename buffer_base::const_iterator; + using reference = typename buffer_base::reference; using const_reference = typename buffer_base::const_reference; buffer() = delete; @@ -60,7 +60,9 @@ class buffer : public buffer_base { buffer& operator=(const buffer& other) = delete; buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) - : buffer_base(alloc, stream, n) {} + : buffer_base(alloc, stream, n) + { + } }; // class buffer }; // namespace device diff --git a/cpp/include/raft/mr/host/allocator.hpp b/cpp/include/raft/mr/host/allocator.hpp index e5b3da24eb..7d31248e7f 100644 --- a/cpp/include/raft/mr/host/allocator.hpp +++ b/cpp/include/raft/mr/host/allocator.hpp @@ -35,20 +35,23 @@ namespace host { * further to the ones listed in `Allocator`: * - Allocations don't need to be zero copy accessible form a device. */ -class allocator : public base_allocator {}; +class allocator : public base_allocator { +}; /** Default cudaMallocHost/cudaFreeHost based host allocator */ class default_allocator : public allocator { public: - void* allocate(std::size_t n, cudaStream_t stream) override { + void* allocate(std::size_t n, cudaStream_t stream) override + { void* ptr = nullptr; CUDA_CHECK(cudaMallocHost(&ptr, n)); return ptr; } - void deallocate(void* p, std::size_t n, cudaStream_t stream) override { - //Must call _NO_THROW here since this is called frequently from object - //destructors which are "nothrow" by default + void deallocate(void* p, std::size_t n, cudaStream_t stream) override + { + // Must call _NO_THROW here since this is called frequently from object + // destructors which are "nothrow" by default CUDA_CHECK_NO_THROW(cudaFreeHost(p)); } }; // class default_allocator diff --git a/cpp/include/raft/mr/host/buffer.hpp b/cpp/include/raft/mr/host/buffer.hpp index 3c505bf2ed..52475ad6ec 100644 --- a/cpp/include/raft/mr/host/buffer.hpp +++ b/cpp/include/raft/mr/host/buffer.hpp @@ -48,11 +48,11 @@ namespace host { template class buffer : public buffer_base { public: - using size_type = typename buffer_base::size_type; - using value_type = typename buffer_base::value_type; - using iterator = typename buffer_base::iterator; - using const_iterator = typename buffer_base::const_iterator; - using reference = typename buffer_base::reference; + using size_type = typename buffer_base::size_type; + using value_type = typename buffer_base::value_type; + using iterator = typename buffer_base::iterator; + using const_iterator = typename buffer_base::const_iterator; + using reference = typename buffer_base::reference; using const_reference = typename buffer_base::const_reference; buffer() = delete; @@ -62,14 +62,15 @@ class buffer : public buffer_base { buffer& operator=(const buffer& other) = delete; buffer(std::shared_ptr alloc, const device::buffer& other) - : buffer_base(alloc, other.get_stream(), other.size()) { - if (other.size() > 0) { - raft::copy(data_, other.data(), other.size(), other.get_stream()); - } + : buffer_base(alloc, other.get_stream(), other.size()) + { + if (other.size() > 0) { raft::copy(data_, other.data(), other.size(), other.get_stream()); } } buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) - : buffer_base(alloc, stream, n) {} + : buffer_base(alloc, stream, n) + { + } reference operator[](size_type pos) { return data_[pos]; } diff --git a/cpp/include/raft/pow2_utils.cuh b/cpp/include/raft/pow2_utils.cuh index de5fc46452..56a3192f9f 100644 --- a/cpp/include/raft/pow2_utils.cuh +++ b/cpp/include/raft/pow2_utils.cuh @@ -29,14 +29,13 @@ template struct Pow2 { typedef decltype(Value_) Type; static constexpr Type Value = Value_; - static constexpr Type Log2 = log2(Value); - static constexpr Type Mask = Value - 1; + static constexpr Type Log2 = log2(Value); + static constexpr Type Mask = Value - 1; static_assert(std::is_integral::value, "Value must be integral."); static_assert(Value && !(Value & Mask), "Value must be power of two."); -#define Pow2_IsRepresentableAs(I) \ - (std::is_integral::value && Type(I(Value)) == Value) +#define Pow2_IsRepresentableAs(I) (std::is_integral::value && Type(I(Value)) == Value) /** * Integer division by Value truncated toward zero @@ -45,10 +44,9 @@ struct Pow2 { * Invariant: `x = Value * quot(x) + rem(x)` */ template - static constexpr HDI std::enable_if_t quot( - I x) noexcept { - if constexpr (std::is_signed::value) - return (x >> I(Log2)) + (x < 0 && (x & I(Mask))); + static constexpr HDI std::enable_if_t quot(I x) noexcept + { + if constexpr (std::is_signed::value) return (x >> I(Log2)) + (x < 0 && (x & I(Mask))); if constexpr (std::is_unsigned::value) return x >> I(Log2); } @@ -59,10 +57,9 @@ struct Pow2 { * Invariant: `x = Value * quot(x) + rem(x)`. */ template - static constexpr HDI std::enable_if_t rem( - I x) noexcept { - if constexpr (std::is_signed::value) - return x < 0 ? -((-x) & I(Mask)) : (x & I(Mask)); + static constexpr HDI std::enable_if_t rem(I x) noexcept + { + if constexpr (std::is_signed::value) return x < 0 ? -((-x) & I(Mask)) : (x & I(Mask)); if constexpr (std::is_unsigned::value) return x & I(Mask); } @@ -77,8 +74,8 @@ struct Pow2 { * compared to normal C++ operators `/` and `%`. */ template - static constexpr HDI std::enable_if_t div( - I x) noexcept { + static constexpr HDI std::enable_if_t div(I x) noexcept + { return x >> I(Log2); } @@ -94,8 +91,8 @@ struct Pow2 { * compared to normal C++ operators `/` and `%`. */ template - static constexpr HDI std::enable_if_t mod( - I x) noexcept { + static constexpr HDI std::enable_if_t mod(I x) noexcept + { return x & I(Mask); } @@ -108,16 +105,17 @@ struct Pow2 { * NB: for pointers, the alignment is checked in bytes, not in elements. */ template - static constexpr HDI bool isAligned(PtrT p) noexcept { + static constexpr HDI bool isAligned(PtrT p) noexcept + { Pow2_CHECK_TYPE(PtrT); if constexpr (Pow2_IsRepresentableAs(PtrT)) return mod(p) == 0; - if constexpr (!Pow2_IsRepresentableAs(PtrT)) - return mod(reinterpret_cast(p)) == 0; + if constexpr (!Pow2_IsRepresentableAs(PtrT)) return mod(reinterpret_cast(p)) == 0; } /** Tell whether two pointers have the same address modulo Value. */ template - static constexpr HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept { + static constexpr HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept + { Pow2_CHECK_TYPE(PtrT); Pow2_CHECK_TYPE(PtrS); Type x, y; @@ -134,10 +132,10 @@ struct Pow2 { /** Get this or next Value-aligned address (in bytes) or integral. */ template - static constexpr HDI PtrT roundUp(PtrT p) noexcept { + static constexpr HDI PtrT roundUp(PtrT p) noexcept + { Pow2_CHECK_TYPE(PtrT); - if constexpr (Pow2_IsRepresentableAs(PtrT)) - return p + PtrT(Mask) - mod(p + PtrT(Mask)); + if constexpr (Pow2_IsRepresentableAs(PtrT)) return p + PtrT(Mask) - mod(p + PtrT(Mask)); if constexpr (!Pow2_IsRepresentableAs(PtrT)) { auto x = reinterpret_cast(p); return reinterpret_cast(x + Mask - mod(x + Mask)); @@ -146,7 +144,8 @@ struct Pow2 { /** Get this or previous Value-aligned address (in bytes) or integral. */ template - static constexpr HDI PtrT roundDown(PtrT p) noexcept { + static constexpr HDI PtrT roundDown(PtrT p) noexcept + { Pow2_CHECK_TYPE(PtrT); if constexpr (Pow2_IsRepresentableAs(PtrT)) return p - mod(p); if constexpr (!Pow2_IsRepresentableAs(PtrT)) { diff --git a/cpp/include/raft/random/detail/rng_impl.cuh b/cpp/include/raft/random/detail/rng_impl.cuh index 654c46bbf9..0f3b58975e 100644 --- a/cpp/include/raft/random/detail/rng_impl.cuh +++ b/cpp/include/raft/random/detail/rng_impl.cuh @@ -44,19 +44,20 @@ enum GeneratorType { }; template -DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1, - Type sigma2, Type mu2) { - constexpr Type twoPi = Type(2.0) * Type(3.141592654); +DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type sigma2, Type mu2) +{ + constexpr Type twoPi = Type(2.0) * Type(3.141592654); constexpr Type minus2 = -Type(2.0); - Type R = raft::mySqrt(minus2 * raft::myLog(val1)); - Type theta = twoPi * val2; + Type R = raft::mySqrt(minus2 * raft::myLog(val1)); + Type theta = twoPi * val2; Type s, c; raft::mySinCos(theta, s, c); val1 = R * c * sigma1 + mu1; val2 = R * s * sigma2 + mu2; } template -DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) { +DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1) +{ box_muller_transform(val1, val2, sigma1, mu1, sigma1, mu1); } @@ -67,10 +68,13 @@ DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) { template struct Generator { DI Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) - : gen(seed, subsequence, offset) {} + : gen(seed, subsequence, offset) + { + } template - DI void next(Type &ret) { + DI void next(Type& ret) + { gen.next(ret); } @@ -79,10 +83,9 @@ struct Generator { GenType gen; }; -template -__global__ void randKernel(uint64_t seed, uint64_t offset, OutType *ptr, - LenType len, Lambda randOp) { +template +__global__ void randKernel(uint64_t seed, uint64_t offset, OutType* ptr, LenType len, Lambda randOp) +{ LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x; Generator gen(seed, (uint64_t)tid, offset); const LenType stride = gridDim.x * blockDim.x; @@ -94,10 +97,10 @@ __global__ void randKernel(uint64_t seed, uint64_t offset, OutType *ptr, } // used for Box-Muller type transformations -template -__global__ void rand2Kernel(uint64_t seed, uint64_t offset, OutType *ptr, - LenType len, Lambda2 rand2Op) { +template +__global__ void rand2Kernel( + uint64_t seed, uint64_t offset, OutType* ptr, LenType len, Lambda2 rand2Op) +{ LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x; Generator gen(seed, (uint64_t)tid, offset); const LenType stride = gridDim.x * blockDim.x; @@ -113,8 +116,9 @@ __global__ void rand2Kernel(uint64_t seed, uint64_t offset, OutType *ptr, } template -__global__ void constFillKernel(Type *ptr, int len, Type val) { - unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; +__global__ void constFillKernel(Type* ptr, int len, Type val) +{ + unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; const unsigned stride = gridDim.x * blockDim.x; for (unsigned idx = tid; idx < len; idx += stride) { ptr[idx] = val; @@ -130,7 +134,8 @@ struct PhiloxGenerator { * @param subsequence as found in curand docs * @param offset as found in curand docs */ - DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { + DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) + { curand_init(seed, subsequence, offset, &state); } @@ -138,21 +143,24 @@ struct PhiloxGenerator { * @defgroup NextRand Generate the next random number * @{ */ - DI void next(float &ret) { ret = curand_uniform(&(this->state)); } - DI void next(double &ret) { ret = curand_uniform_double(&(this->state)); } - DI void next(uint32_t &ret) { ret = curand(&(this->state)); } - DI void next(uint64_t &ret) { + DI void next(float& ret) { ret = curand_uniform(&(this->state)); } + DI void next(double& ret) { ret = curand_uniform_double(&(this->state)); } + DI void next(uint32_t& ret) { ret = curand(&(this->state)); } + DI void next(uint64_t& ret) + { uint32_t a, b; next(a); next(b); ret = (uint64_t)a | ((uint64_t)b << 32); } - DI void next(int32_t &ret) { + DI void next(int32_t& ret) + { uint32_t val; next(val); ret = int32_t(val & 0x7fffffff); } - DI void next(int64_t &ret) { + DI void next(int64_t& ret) + { uint64_t val; next(val); ret = int64_t(val & 0x7fffffffffffffff); @@ -173,8 +181,9 @@ struct TapsGenerator { * @param subsequence unused * @param offset unused */ - DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { - uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x; + DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) + { + uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x; uint64_t stride = blockDim.x * gridDim.x; delta += ((blockIdx.y * blockDim.y) + threadIdx.y) * stride; stride *= blockDim.y * gridDim.y; @@ -187,31 +196,36 @@ struct TapsGenerator { * @{ */ template - DI void next(Type &ret) { + DI void next(Type& ret) + { constexpr double ULL_LARGE = 1.8446744073709551614e19; uint64_t val; next(val); ret = static_cast(val); ret /= static_cast(ULL_LARGE); } - DI void next(uint64_t &ret) { + DI void next(uint64_t& ret) + { constexpr uint64_t TAPS = 0x8000100040002000ULL; - constexpr int ROUNDS = 128; + constexpr int ROUNDS = 128; for (int i = 0; i < ROUNDS; i++) state = (state >> 1) ^ (-(state & 1ULL) & TAPS); ret = state; } - DI void next(uint32_t &ret) { + DI void next(uint32_t& ret) + { uint64_t val; next(val); ret = (uint32_t)val; } - DI void next(int32_t &ret) { + DI void next(int32_t& ret) + { uint32_t val; next(val); ret = int32_t(val & 0x7fffffff); } - DI void next(int64_t &ret) { + DI void next(int64_t& ret) + { uint64_t val; next(val); ret = int64_t(val & 0x7fffffffffffffff); @@ -232,46 +246,49 @@ struct Kiss99Generator { * @param subsequence unused * @param offset unused */ - DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { - initKiss99(seed); - } + DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { initKiss99(seed); } /** * @defgroup NextRand Generate the next random number * @{ */ template - DI void next(Type &ret) { + DI void next(Type& ret) + { constexpr double U_LARGE = 4.294967295e9; uint32_t val; next(val); ret = static_cast(val); ret /= static_cast(U_LARGE); } - DI void next(uint32_t &ret) { + DI void next(uint32_t& ret) + { uint32_t MWC; - z = 36969 * (z & 65535) + (z >> 16); - w = 18000 * (w & 65535) + (w >> 16); + z = 36969 * (z & 65535) + (z >> 16); + w = 18000 * (w & 65535) + (w >> 16); MWC = ((z << 16) + w); jsr ^= (jsr << 17); jsr ^= (jsr >> 13); jsr ^= (jsr << 5); jcong = 69069 * jcong + 1234567; - MWC = ((MWC ^ jcong) + jsr); - ret = MWC; + MWC = ((MWC ^ jcong) + jsr); + ret = MWC; } - DI void next(uint64_t &ret) { + DI void next(uint64_t& ret) + { uint32_t a, b; next(a); next(b); ret = (uint64_t)a | ((uint64_t)b << 32); } - DI void next(int32_t &ret) { + DI void next(int32_t& ret) + { uint32_t val; next(val); ret = int32_t(val & 0x7fffffff); } - DI void next(int64_t &ret) { + DI void next(int64_t& ret) + { uint64_t val; next(val); ret = int64_t(val & 0x7fffffffffffffff); @@ -290,7 +307,8 @@ struct Kiss99Generator { // This function multiplies 128-bit hash by 128-bit FNV prime and returns lower // 128 bits. It uses 32-bit wide multiply only. - DI void mulByFnv1a128Prime(uint32_t *h) { + DI void mulByFnv1a128Prime(uint32_t* h) + { typedef union { uint32_t u32[2]; uint64_t u64[1]; @@ -314,12 +332,12 @@ struct Kiss99Generator { // h_n[2] = HI(h[1]*p[0]) + LO(h[2]*p[0]) + LO(h[0]*p[2]); // h_n[3] = HI(h[2]*p[0]) + HI(h[0]*p[2]) + LO(h[3]*p[0]) + LO(h[1]*p[2]); uint32_t carry = 0; - h[0] = h0p0.u32[0]; + h[0] = h0p0.u32[0]; - h[1] = h0p0.u32[1] + h1p0.u32[0]; + h[1] = h0p0.u32[1] + h1p0.u32[0]; carry = h[1] < h0p0.u32[1] ? 1 : 0; - h[2] = h1p0.u32[1] + carry; + h[2] = h1p0.u32[1] + carry; carry = h[2] < h1p0.u32[1] ? 1 : 0; h[2] += h2p0.u32[0]; carry = h[2] < h2p0.u32[0] ? carry + 1 : carry; @@ -330,7 +348,8 @@ struct Kiss99Generator { return; } - DI void fnv1a128(uint32_t *hash, uint32_t txt) { + DI void fnv1a128(uint32_t* hash, uint32_t txt) + { hash[0] ^= (txt >> 0) & 0xFF; mulByFnv1a128Prime(hash); hash[0] ^= (txt >> 8) & 0xFF; @@ -341,7 +360,8 @@ struct Kiss99Generator { mulByFnv1a128Prime(hash); } - DI void initKiss99(uint64_t seed) { + DI void initKiss99(uint64_t seed) + { // Initialize hash to 128-bit FNV1a basis uint32_t hash[4] = {1653982605UL, 1656234357UL, 129696066UL, 1818371886UL}; @@ -356,9 +376,9 @@ struct Kiss99Generator { fnv1a128(hash, uint32_t(seed >> 32)); // Initialize KISS99 state with hash - z = hash[0]; - w = hash[1]; - jsr = hash[2]; + z = hash[0]; + w = hash[1]; + jsr = hash[2]; jcong = hash[3]; } }; @@ -372,17 +392,20 @@ class RngImpl { // simple heuristic to make sure all SMs will be occupied properly // and also not too many initialization calls will be made by each thread nBlocks(4 * getMultiProcessorCount()), - gen() { + gen() + { seed(_s); } - void seed(uint64_t _s) { + void seed(uint64_t _s) + { gen.seed(_s); offset = 0; } template - void affine_transform_params(IdxT n, IdxT &a, IdxT &b) { + void affine_transform_params(IdxT n, IdxT& a, IdxT& b) + { // always keep 'a' to be coprime to 'n' a = gen() % n; while (gcd(a, n) != 1) { @@ -394,128 +417,150 @@ class RngImpl { } template - void uniform(Type *ptr, LenType len, Type start, Type end, - cudaStream_t stream) { + void uniform(Type* ptr, LenType len, Type start, Type end, cudaStream_t stream) + { static_assert(std::is_floating_point::value, "Type for 'uniform' can only be floating point!"); custom_distribution( - ptr, len, - [=] __device__(Type val, LenType idx) { - return (val * (end - start)) + start; - }, + ptr, + len, + [=] __device__(Type val, LenType idx) { return (val * (end - start)) + start; }, stream); } template - void uniformInt(IntType *ptr, LenType len, IntType start, IntType end, - cudaStream_t stream) { - static_assert(std::is_integral::value, - "Type for 'uniformInt' can only be integer!"); + void uniformInt(IntType* ptr, LenType len, IntType start, IntType end, cudaStream_t stream) + { + static_assert(std::is_integral::value, "Type for 'uniformInt' can only be integer!"); custom_distribution( - ptr, len, - [=] __device__(IntType val, LenType idx) { - return (val % (end - start)) + start; - }, + ptr, + len, + [=] __device__(IntType val, LenType idx) { return (val % (end - start)) + start; }, stream); } template - void normal(Type *ptr, LenType len, Type mu, Type sigma, - cudaStream_t stream) { + void normal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream) + { static_assert(std::is_floating_point::value, "Type for 'normal' can only be floating point!"); rand2Impl( - offset, ptr, len, + offset, + ptr, + len, [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { box_muller_transform(val1, val2, sigma, mu); }, - NumThreads, nBlocks, type, stream); + NumThreads, + nBlocks, + type, + stream); } template - void normalInt(IntType *ptr, LenType len, IntType mu, IntType sigma, - cudaStream_t stream) { - static_assert(std::is_integral::value, - "Type for 'normalInt' can only be integer!"); + void normalInt(IntType* ptr, LenType len, IntType mu, IntType sigma, cudaStream_t stream) + { + static_assert(std::is_integral::value, "Type for 'normalInt' can only be integer!"); rand2Impl( - offset, ptr, len, - [=] __device__(double &val1, double &val2, LenType idx1, LenType idx2) { + offset, + ptr, + len, + [=] __device__(double& val1, double& val2, LenType idx1, LenType idx2) { box_muller_transform(val1, val2, sigma, mu); }, - NumThreads, nBlocks, type, stream); + NumThreads, + nBlocks, + type, + stream); } template - void normalTable(Type *ptr, LenType n_rows, LenType n_cols, const Type *mu, - const Type *sigma_vec, Type sigma, cudaStream_t stream) { + void normalTable(Type* ptr, + LenType n_rows, + LenType n_cols, + const Type* mu, + const Type* sigma_vec, + Type sigma, + cudaStream_t stream) + { rand2Impl( - offset, ptr, n_rows * n_cols, + offset, + ptr, + n_rows * n_cols, [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { // yikes! use fast-int-div - auto col1 = idx1 % n_cols; - auto col2 = idx2 % n_cols; + auto col1 = idx1 % n_cols; + auto col2 = idx2 % n_cols; auto mean1 = mu[col1]; auto mean2 = mu[col2]; - auto sig1 = sigma_vec == nullptr ? sigma : sigma_vec[col1]; - auto sig2 = sigma_vec == nullptr ? sigma : sigma_vec[col2]; + auto sig1 = sigma_vec == nullptr ? sigma : sigma_vec[col1]; + auto sig2 = sigma_vec == nullptr ? sigma : sigma_vec[col2]; box_muller_transform(val1, val2, sig1, mean1, sig2, mean2); }, - NumThreads, nBlocks, type, stream); + NumThreads, + nBlocks, + type, + stream); } template - void fill(Type *ptr, LenType len, Type val, cudaStream_t stream) { - detail::constFillKernel - <<>>(ptr, len, val); + void fill(Type* ptr, LenType len, Type val, cudaStream_t stream) + { + detail::constFillKernel<<>>(ptr, len, val); CUDA_CHECK(cudaPeekAtLastError()); } template - void bernoulli(OutType *ptr, LenType len, Type prob, cudaStream_t stream) { + void bernoulli(OutType* ptr, LenType len, Type prob, cudaStream_t stream) + { custom_distribution( - ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; }, - stream); + ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; }, stream); } template - void scaled_bernoulli(Type *ptr, LenType len, Type prob, Type scale, - cudaStream_t stream) { + void scaled_bernoulli(Type* ptr, LenType len, Type prob, Type scale, cudaStream_t stream) + { static_assert(std::is_floating_point::value, "Type for 'scaled_bernoulli' can only be floating point!"); custom_distribution( - ptr, len, - [=] __device__(Type val, LenType idx) { - return val > prob ? -scale : scale; - }, + ptr, + len, + [=] __device__(Type val, LenType idx) { return val > prob ? -scale : scale; }, stream); } template - void gumbel(Type *ptr, LenType len, Type mu, Type beta, cudaStream_t stream) { + void gumbel(Type* ptr, LenType len, Type mu, Type beta, cudaStream_t stream) + { custom_distribution( - ptr, len, - [=] __device__(Type val, LenType idx) { - return mu - beta * raft::myLog(-raft::myLog(val)); - }, + ptr, + len, + [=] __device__(Type val, LenType idx) { return mu - beta * raft::myLog(-raft::myLog(val)); }, stream); } template - void lognormal(Type *ptr, LenType len, Type mu, Type sigma, - cudaStream_t stream) { + void lognormal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream) + { rand2Impl( - offset, ptr, len, + offset, + ptr, + len, [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) { box_muller_transform(val1, val2, sigma, mu); val1 = raft::myExp(val1); val2 = raft::myExp(val2); }, - NumThreads, nBlocks, type, stream); + NumThreads, + nBlocks, + type, + stream); } template - void logistic(Type *ptr, LenType len, Type mu, Type scale, - cudaStream_t stream) { + void logistic(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream) + { custom_distribution( - ptr, len, + ptr, + len, [=] __device__(Type val, LenType idx) { constexpr Type one = (Type)1.0; return mu - scale * raft::myLog(one / val - one); @@ -524,9 +569,11 @@ class RngImpl { } template - void exponential(Type *ptr, LenType len, Type lambda, cudaStream_t stream) { + void exponential(Type* ptr, LenType len, Type lambda, cudaStream_t stream) + { custom_distribution( - ptr, len, + ptr, + len, [=] __device__(Type val, LenType idx) { constexpr Type one = (Type)1.0; return -raft::myLog(one - val) / lambda; @@ -535,9 +582,11 @@ class RngImpl { } template - void rayleigh(Type *ptr, LenType len, Type sigma, cudaStream_t stream) { + void rayleigh(Type* ptr, LenType len, Type sigma, cudaStream_t stream) + { custom_distribution( - ptr, len, + ptr, + len, [=] __device__(Type val, LenType idx) { constexpr Type one = (Type)1.0; constexpr Type two = (Type)2.0; @@ -547,13 +596,14 @@ class RngImpl { } template - void laplace(Type *ptr, LenType len, Type mu, Type scale, - cudaStream_t stream) { + void laplace(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream) + { custom_distribution( - ptr, len, + ptr, + len, [=] __device__(Type val, LenType idx) { - constexpr Type one = (Type)1.0; - constexpr Type two = (Type)2.0; + constexpr Type one = (Type)1.0; + constexpr Type two = (Type)2.0; constexpr Type oneHalf = (Type)0.5; Type out; if (val <= oneHalf) { @@ -567,55 +617,55 @@ class RngImpl { } template - void sampleWithoutReplacement(const raft::handle_t &handle, DataT *out, - IdxT *outIdx, const DataT *in, - const WeightsT *wts, IdxT sampledLen, IdxT len, - cudaStream_t stream) { - ASSERT(sampledLen <= len, - "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'."); + void sampleWithoutReplacement(const raft::handle_t& handle, + DataT* out, + IdxT* outIdx, + const DataT* in, + const WeightsT* wts, + IdxT sampledLen, + IdxT len, + cudaStream_t stream) + { + ASSERT(sampledLen <= len, "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'."); rmm::device_uvector expWts(len, stream); rmm::device_uvector sortedWts(len, stream); rmm::device_uvector inIdx(len, stream); rmm::device_uvector outIdxBuff(len, stream); - auto *inIdxPtr = inIdx.data(); + auto* inIdxPtr = inIdx.data(); // generate modified weights custom_distribution( - expWts.data(), len, + expWts.data(), + len, [wts, inIdxPtr] __device__(WeightsT val, IdxT idx) { - inIdxPtr[idx] = idx; + inIdxPtr[idx] = idx; constexpr WeightsT one = (WeightsT)1.0; - auto exp = -raft::myLog(one - val); - if (wts != nullptr) { - return exp / wts[idx]; - } + auto exp = -raft::myLog(one - val); + if (wts != nullptr) { return exp / wts[idx]; } return exp; }, stream); ///@todo: use a more efficient partitioning scheme instead of full sort // sort the array and pick the top sampledLen items - IdxT *outIdxPtr = outIdxBuff.data(); + IdxT* outIdxPtr = outIdxBuff.data(); rmm::device_uvector workspace(0, stream); - sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, - (int)len, stream); + sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, (int)len, stream); if (outIdx != nullptr) { - CUDA_CHECK(cudaMemcpyAsync(outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync( + outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, cudaMemcpyDeviceToDevice, stream)); } raft::scatter(out, in, outIdxPtr, sampledLen, stream); } - template - void custom_distribution(OutType *ptr, LenType len, Lambda randOp, - cudaStream_t stream) { + template + void custom_distribution(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream) + { randImpl( offset, ptr, len, randOp, NumThreads, nBlocks, type, stream); } - template - void custom_distribution2(OutType *ptr, LenType len, Lambda randOp, - cudaStream_t stream) { + template + void custom_distribution2(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream) + { rand2Impl( offset, ptr, len, randOp, NumThreads, nBlocks, type, stream); } @@ -625,10 +675,10 @@ class RngImpl { /** generator type */ GeneratorType type; /** - * offset is also used to initialize curand state. - * Limits period of Philox RNG from (4 * 2^128) to (Blocks * Threads * 2^64), - * but is still a large period. - */ + * offset is also used to initialize curand state. + * Limits period of Philox RNG from (4 * 2^128) to (Blocks * Threads * 2^64), + * but is still a large period. + */ uint64_t offset; /** number of blocks to launch */ int nBlocks; @@ -638,12 +688,10 @@ class RngImpl { static const int NumThreads = 256; template - uint64_t _setupSeeds(uint64_t &seed, uint64_t &offset, LenType len, - int nThreads, int nBlocks) { + uint64_t _setupSeeds(uint64_t& seed, uint64_t& offset, LenType len, int nThreads, int nBlocks) + { LenType itemsPerThread = raft::ceildiv(len, LenType(nBlocks * nThreads)); - if (IsNormal && itemsPerThread % 2 == 1) { - ++itemsPerThread; - } + if (IsNormal && itemsPerThread % 2 == 1) { ++itemsPerThread; } // curand uses 2 32b uint's to generate one double uint64_t factor = sizeof(Type) / sizeof(float); if (factor == 0) ++factor; @@ -651,72 +699,72 @@ class RngImpl { // If not, then generate new seed and start from zero offset uint64_t newOffset = offset + LenType(itemsPerThread) * factor; if (newOffset < offset) { - offset = 0; - seed = gen(); + offset = 0; + seed = gen(); newOffset = itemsPerThread * factor; } return newOffset; } - template - void randImpl(uint64_t &offset, OutType *ptr, LenType len, Lambda randOp, - int nThreads, int nBlocks, GeneratorType type, - cudaStream_t stream) { + template + void randImpl(uint64_t& offset, + OutType* ptr, + LenType len, + Lambda randOp, + int nThreads, + int nBlocks, + GeneratorType type, + cudaStream_t stream) + { if (len <= 0) return; - uint64_t seed = gen(); - auto newOffset = _setupSeeds(seed, offset, len, - nThreads, nBlocks); + uint64_t seed = gen(); + auto newOffset = _setupSeeds(seed, offset, len, nThreads, nBlocks); switch (type) { case GenPhilox: - detail::randKernel + detail::randKernel <<>>(seed, offset, ptr, len, randOp); break; case GenTaps: - detail::randKernel + detail::randKernel <<>>(seed, offset, ptr, len, randOp); break; case GenKiss99: - detail::randKernel + detail::randKernel <<>>(seed, offset, ptr, len, randOp); break; - default: - ASSERT(false, "randImpl: Incorrect generator type! %d", type); + default: ASSERT(false, "randImpl: Incorrect generator type! %d", type); }; CUDA_CHECK(cudaGetLastError()); offset = newOffset; } - template - void rand2Impl(uint64_t &offset, OutType *ptr, LenType len, Lambda2 rand2Op, - int nThreads, int nBlocks, GeneratorType type, - cudaStream_t stream) { + template + void rand2Impl(uint64_t& offset, + OutType* ptr, + LenType len, + Lambda2 rand2Op, + int nThreads, + int nBlocks, + GeneratorType type, + cudaStream_t stream) + { if (len <= 0) return; - auto seed = gen(); - auto newOffset = _setupSeeds(seed, offset, len, - nThreads, nBlocks); + auto seed = gen(); + auto newOffset = _setupSeeds(seed, offset, len, nThreads, nBlocks); switch (type) { case GenPhilox: - detail::rand2Kernel + detail::rand2Kernel <<>>(seed, offset, ptr, len, rand2Op); break; case GenTaps: - detail::rand2Kernel + detail::rand2Kernel <<>>(seed, offset, ptr, len, rand2Op); break; case GenKiss99: - detail::rand2Kernel + detail::rand2Kernel <<>>(seed, offset, ptr, len, rand2Op); break; - default: - ASSERT(false, "rand2Impl: Incorrect generator type! %d", type); + default: ASSERT(false, "rand2Impl: Incorrect generator type! %d", type); }; CUDA_CHECK(cudaGetLastError()); offset = newOffset; diff --git a/cpp/include/raft/random/rng.hpp b/cpp/include/raft/random/rng.hpp index b6b0911ab0..0cced7c626 100644 --- a/cpp/include/raft/random/rng.hpp +++ b/cpp/include/raft/random/rng.hpp @@ -51,12 +51,13 @@ using detail::Kiss99Generator; * @{ */ template -DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1, - Type sigma2, Type mu2) { +DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type sigma2, Type mu2) +{ detail::box_muller_transform(val1, val2, sigma1, mu1, sigma1, mu2); } template -DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) { +DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1) +{ detail::box_muller_transform(val1, val2, sigma1, mu1); } /** @} */ @@ -92,7 +93,8 @@ class Rng : public detail::RngImpl { * @param[out] b intercept parameter */ template - void affine_transform_params(IdxT n, IdxT &a, IdxT &b) { + void affine_transform_params(IdxT n, IdxT& a, IdxT& b) + { detail::RngImpl::affine_transform_params(n, a, b); } @@ -108,13 +110,13 @@ class Rng : public detail::RngImpl { * @{ */ template - void uniform(Type *ptr, LenType len, Type start, Type end, - cudaStream_t stream) { + void uniform(Type* ptr, LenType len, Type start, Type end, cudaStream_t stream) + { detail::RngImpl::uniform(ptr, len, start, end, stream); } template - void uniformInt(IntType *ptr, LenType len, IntType start, IntType end, - cudaStream_t stream) { + void uniformInt(IntType* ptr, LenType len, IntType start, IntType end, cudaStream_t stream) + { detail::RngImpl::uniformInt(ptr, len, start, end, stream); } /** @} */ @@ -131,13 +133,13 @@ class Rng : public detail::RngImpl { * @{ */ template - void normal(Type *ptr, LenType len, Type mu, Type sigma, - cudaStream_t stream) { + void normal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream) + { detail::RngImpl::normal(ptr, len, mu, sigma, stream); } template - void normalInt(IntType *ptr, LenType len, IntType mu, IntType sigma, - cudaStream_t stream) { + void normalInt(IntType* ptr, LenType len, IntType mu, IntType sigma, cudaStream_t stream) + { detail::RngImpl::normalInt(ptr, len, mu, sigma, stream); } /** @} */ @@ -163,10 +165,15 @@ class Rng : public detail::RngImpl { * @param stream stream where to launch the kernel */ template - void normalTable(Type *ptr, LenType n_rows, LenType n_cols, const Type *mu, - const Type *sigma_vec, Type sigma, cudaStream_t stream) { - detail::RngImpl::normalTable(ptr, n_rows, n_cols, mu, sigma_vec, sigma, - stream); + void normalTable(Type* ptr, + LenType n_rows, + LenType n_cols, + const Type* mu, + const Type* sigma_vec, + Type sigma, + cudaStream_t stream) + { + detail::RngImpl::normalTable(ptr, n_rows, n_cols, mu, sigma_vec, sigma, stream); } /** @@ -179,7 +186,8 @@ class Rng : public detail::RngImpl { * @param stream stream where to launch the kernel */ template - void fill(Type *ptr, LenType len, Type val, cudaStream_t stream) { + void fill(Type* ptr, LenType len, Type val, cudaStream_t stream) + { detail::RngImpl::fill(ptr, len, val, stream); } @@ -196,7 +204,8 @@ class Rng : public detail::RngImpl { * @param[in] stream stream where to launch the kernel */ template - void bernoulli(OutType *ptr, LenType len, Type prob, cudaStream_t stream) { + void bernoulli(OutType* ptr, LenType len, Type prob, cudaStream_t stream) + { detail::RngImpl::bernoulli(ptr, len, prob, stream); } @@ -211,8 +220,8 @@ class Rng : public detail::RngImpl { * @param stream stream where to launch the kernel */ template - void scaled_bernoulli(Type *ptr, LenType len, Type prob, Type scale, - cudaStream_t stream) { + void scaled_bernoulli(Type* ptr, LenType len, Type prob, Type scale, cudaStream_t stream) + { detail::RngImpl::scaled_bernoulli(ptr, len, prob, scale, stream); } @@ -228,7 +237,8 @@ class Rng : public detail::RngImpl { * @note https://en.wikipedia.org/wiki/Gumbel_distribution */ template - void gumbel(Type *ptr, LenType len, Type mu, Type beta, cudaStream_t stream) { + void gumbel(Type* ptr, LenType len, Type mu, Type beta, cudaStream_t stream) + { detail::RngImpl::gumbel(ptr, len, mu, beta, stream); } @@ -243,8 +253,8 @@ class Rng : public detail::RngImpl { * @param stream stream where to launch the kernel */ template - void lognormal(Type *ptr, LenType len, Type mu, Type sigma, - cudaStream_t stream) { + void lognormal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream) + { detail::RngImpl::lognormal(ptr, len, mu, sigma, stream); } @@ -259,8 +269,8 @@ class Rng : public detail::RngImpl { * @param stream stream where to launch the kernel */ template - void logistic(Type *ptr, LenType len, Type mu, Type scale, - cudaStream_t stream) { + void logistic(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream) + { detail::RngImpl::logistic(ptr, len, mu, scale, stream); } @@ -274,7 +284,8 @@ class Rng : public detail::RngImpl { * @param stream stream where to launch the kernel */ template - void exponential(Type *ptr, LenType len, Type lambda, cudaStream_t stream) { + void exponential(Type* ptr, LenType len, Type lambda, cudaStream_t stream) + { detail::RngImpl::exponential(ptr, len, lambda, stream); } @@ -288,7 +299,8 @@ class Rng : public detail::RngImpl { * @param stream stream where to launch the kernel */ template - void rayleigh(Type *ptr, LenType len, Type sigma, cudaStream_t stream) { + void rayleigh(Type* ptr, LenType len, Type sigma, cudaStream_t stream) + { detail::RngImpl::rayleigh(ptr, len, sigma, stream); } @@ -303,8 +315,8 @@ class Rng : public detail::RngImpl { * @param stream stream where to launch the kernel */ template - void laplace(Type *ptr, LenType len, Type mu, Type scale, - cudaStream_t stream) { + void laplace(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream) + { detail::RngImpl::laplace(ptr, len, mu, scale, stream); } @@ -334,12 +346,17 @@ class Rng : public detail::RngImpl { * @param stream cuda stream */ template - void sampleWithoutReplacement(const raft::handle_t &handle, DataT *out, - IdxT *outIdx, const DataT *in, - const WeightsT *wts, IdxT sampledLen, IdxT len, - cudaStream_t stream) { - detail::RngImpl::sampleWithoutReplacement(handle, out, outIdx, in, wts, - sampledLen, len, stream); + void sampleWithoutReplacement(const raft::handle_t& handle, + DataT* out, + IdxT* outIdx, + const DataT* in, + const WeightsT* wts, + IdxT sampledLen, + IdxT len, + cudaStream_t stream) + { + detail::RngImpl::sampleWithoutReplacement( + handle, out, outIdx, in, wts, sampledLen, len, stream); } /** @@ -357,16 +374,14 @@ class Rng : public detail::RngImpl { * @param[in] stream cuda stream * @{ */ - template - void custom_distribution(OutType *ptr, LenType len, Lambda randOp, - cudaStream_t stream) { + template + void custom_distribution(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream) + { detail::RngImpl::custom_distribution(ptr, len, randOp, stream); } - template - void custom_distribution2(OutType *ptr, LenType len, Lambda randOp, - cudaStream_t stream) { + template + void custom_distribution2(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream) + { detail::RngImpl::custom_distribution2(ptr, len, randOp, stream); } /** @} */ diff --git a/cpp/include/raft/sparse/convert/coo.cuh b/cpp/include/raft/sparse/convert/coo.cuh index e367550060..5d38bdf4a8 100644 --- a/cpp/include/raft/sparse/convert/coo.cuh +++ b/cpp/include/raft/sparse/convert/coo.cuh @@ -37,14 +37,18 @@ namespace sparse { namespace convert { template -__global__ void csr_to_coo_kernel(const value_idx *row_ind, value_idx m, - value_idx *coo_rows, value_idx nnz) { +__global__ void csr_to_coo_kernel(const value_idx* row_ind, + value_idx m, + value_idx* coo_rows, + value_idx nnz) +{ // row-based matrix 1 thread per row value_idx row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < m) { value_idx start_idx = row_ind[row]; - value_idx stop_idx = get_stop_idx(row, m, nnz, row_ind); - for (value_idx i = start_idx; i < stop_idx; i++) coo_rows[i] = row; + value_idx stop_idx = get_stop_idx(row, m, nnz, row_ind); + for (value_idx i = start_idx; i < stop_idx; i++) + coo_rows[i] = row; } } @@ -57,14 +61,14 @@ __global__ void csr_to_coo_kernel(const value_idx *row_ind, value_idx m, * @param stream: cuda stream to use */ template -void csr_to_coo(const value_idx *row_ind, value_idx m, value_idx *coo_rows, - value_idx nnz, cudaStream_t stream) { +void csr_to_coo( + const value_idx* row_ind, value_idx m, value_idx* coo_rows, value_idx nnz, cudaStream_t stream) +{ // @TODO: Use cusparse for this. dim3 grid(raft::ceildiv(m, (value_idx)TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_to_coo_kernel - <<>>(row_ind, m, coo_rows, nnz); + csr_to_coo_kernel<<>>(row_ind, m, coo_rows, nnz); CUDA_CHECK(cudaGetLastError()); } diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh index 79b18ebd0a..2569b5d90f 100644 --- a/cpp/include/raft/sparse/convert/csr.cuh +++ b/cpp/include/raft/sparse/convert/csr.cuh @@ -43,28 +43,32 @@ namespace sparse { namespace convert { template -void coo_to_csr(const raft::handle_t &handle, const int *srcRows, - const int *srcCols, const value_t *srcVals, int nnz, int m, - int *dst_offsets, int *dstCols, value_t *dstVals) { - auto stream = handle.get_stream(); +void coo_to_csr(const raft::handle_t& handle, + const int* srcRows, + const int* srcCols, + const value_t* srcVals, + int nnz, + int m, + int* dst_offsets, + int* dstCols, + value_t* dstVals) +{ + auto stream = handle.get_stream(); auto cusparseHandle = handle.get_cusparse_handle(); rmm::device_uvector dstRows(nnz, stream); - CUDA_CHECK(cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, - cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK(cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, - cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK( + cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK( + cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); auto buffSize = raft::sparse::cusparsecoosort_bufferSizeExt( cusparseHandle, m, m, nnz, srcRows, srcCols, stream); rmm::device_uvector pBuffer(buffSize, stream); rmm::device_uvector P(nnz, stream); - CUSPARSE_CHECK( - cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data())); - raft::sparse::cusparsecoosortByRow(cusparseHandle, m, m, nnz, dstRows.data(), - dstCols, P.data(), pBuffer.data(), stream); - raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(), - stream); - raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m, - dst_offsets, stream); + CUSPARSE_CHECK(cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data())); + raft::sparse::cusparsecoosortByRow( + cusparseHandle, m, m, nnz, dstRows.data(), dstCols, P.data(), pBuffer.data(), stream); + raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(), stream); + raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m, dst_offsets, stream); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -83,14 +87,20 @@ void coo_to_csr(const raft::handle_t &handle, const int *srcRows, * @param stream cuda stream to use * @param fused_op: the fused operation */ -template void> -void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, - Index_ batchSize, const bool *adj, - Index_ *row_ind_ptr, cudaStream_t stream, - Lambda fused_op) { +template void> +void csr_adj_graph_batched(const Index_* row_ind, + Index_ total_rows, + Index_ nnz, + Index_ batchSize, + const bool* adj, + Index_* row_ind_ptr, + cudaStream_t stream, + Lambda fused_op) +{ op::csr_row_op( - row_ind, batchSize, nnz, + row_ind, + batchSize, + nnz, [fused_op, adj, total_rows, row_ind_ptr, batchSize, nnz] __device__( Index_ row, Index_ start_idx, Index_ stop_idx) { fused_op(row, start_idx, stop_idx); @@ -106,14 +116,23 @@ void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, stream); } -template void> -void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, - Index_ batchSize, const bool *adj, - Index_ *row_ind_ptr, cudaStream_t stream) { - csr_adj_graph_batched( - row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream, - [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {}); +template void> +void csr_adj_graph_batched(const Index_* row_ind, + Index_ total_rows, + Index_ nnz, + Index_ batchSize, + const bool* adj, + Index_* row_ind_ptr, + cudaStream_t stream) +{ + csr_adj_graph_batched(row_ind, + total_rows, + nnz, + batchSize, + adj, + row_ind_ptr, + stream, + [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {}); } /** @@ -129,13 +148,17 @@ void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz, * @param stream cuda stream to use * @param fused_op the fused operation */ -template void> -void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz, - const bool *adj, Index_ *row_ind_ptr, cudaStream_t stream, - Lambda fused_op) { - csr_adj_graph_batched(row_ind, total_rows, nnz, total_rows, - adj, row_ind_ptr, stream, fused_op); +template void> +void csr_adj_graph(const Index_* row_ind, + Index_ total_rows, + Index_ nnz, + const bool* adj, + Index_* row_ind_ptr, + cudaStream_t stream, + Lambda fused_op) +{ + csr_adj_graph_batched( + row_ind, total_rows, nnz, total_rows, adj, row_ind_ptr, stream, fused_op); } /** @@ -148,8 +171,8 @@ void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz, * @param stream: cuda stream to use */ template -void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, - cudaStream_t stream) { +void sorted_coo_to_csr(const T* rows, int nnz, T* row_ind, int m, cudaStream_t stream) +{ rmm::device_uvector row_counts(m, stream); CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream)); @@ -157,11 +180,9 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, linalg::coo_degree<32>(rows, nnz, row_counts.data(), stream); // create csr compressed row index from row counts - thrust::device_ptr row_counts_d = - thrust::device_pointer_cast(row_counts.data()); - thrust::device_ptr c_ind_d = thrust::device_pointer_cast(row_ind); - exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, - c_ind_d); + thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); + thrust::device_ptr c_ind_d = thrust::device_pointer_cast(row_ind); + exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d); } /** @@ -172,7 +193,8 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, * @param stream: cuda stream to use */ template -void sorted_coo_to_csr(COO *coo, int *row_ind, cudaStream_t stream) { +void sorted_coo_to_csr(COO* coo, int* row_ind, cudaStream_t stream) +{ sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, stream); } diff --git a/cpp/include/raft/sparse/convert/dense.cuh b/cpp/include/raft/sparse/convert/dense.cuh index 299f9d36d4..e90882b501 100644 --- a/cpp/include/raft/sparse/convert/dense.cuh +++ b/cpp/include/raft/sparse/convert/dense.cuh @@ -37,22 +37,20 @@ namespace sparse { namespace convert { template -__global__ void csr_to_dense_warp_per_row_kernel(int n_cols, - const value_t *csrVal, - const int *csrRowPtr, - const int *csrColInd, - value_t *a) { +__global__ void csr_to_dense_warp_per_row_kernel( + int n_cols, const value_t* csrVal, const int* csrRowPtr, const int* csrColInd, value_t* a) +{ int row = blockIdx.x; int tid = threadIdx.x; int colStart = csrRowPtr[row]; - int colEnd = csrRowPtr[row + 1]; - int rowNnz = colEnd - colStart; + int colEnd = csrRowPtr[row + 1]; + int rowNnz = colEnd - colStart; for (int i = tid; i < rowNnz; i += blockDim.x) { int colIdx = colStart + i; if (colIdx < colEnd) { - int col = csrColInd[colIdx]; + int col = csrColInd[colIdx]; a[row * n_cols + col] = csrVal[colIdx]; } } @@ -77,10 +75,17 @@ __global__ void csr_to_dense_warp_per_row_kernel(int n_cols, * @param[in] row_major : Is row-major output desired? */ template -void csr_to_dense(cusparseHandle_t handle, value_idx nrows, value_idx ncols, - const value_idx *csr_indptr, const value_idx *csr_indices, - const value_t *csr_data, value_idx lda, value_t *out, - cudaStream_t stream, bool row_major = true) { +void csr_to_dense(cusparseHandle_t handle, + value_idx nrows, + value_idx ncols, + const value_idx* csr_indptr, + const value_idx* csr_indices, + const value_t* csr_data, + value_idx lda, + value_t* out, + cudaStream_t stream, + bool row_major = true) +{ if (!row_major) { /** * If we need col-major, use cusparse. @@ -91,15 +96,13 @@ void csr_to_dense(cusparseHandle_t handle, value_idx nrows, value_idx ncols, CUSPARSE_CHECK(cusparseSetMatType(out_mat, CUSPARSE_MATRIX_TYPE_GENERAL)); CUSPARSE_CHECK(raft::sparse::cusparsecsr2dense( - handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out, - lda, stream)); + handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out, lda, stream)); CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(out_mat)); } else { int blockdim = block_dim(ncols); - CUDA_CHECK( - cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream)); + CUDA_CHECK(cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream)); csr_to_dense_warp_per_row_kernel<<>>( ncols, csr_data, csr_indptr, csr_indices, out); } diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh index fa21614f8f..ad1bac1e75 100644 --- a/cpp/include/raft/sparse/coo.cuh +++ b/cpp/include/raft/sparse/coo.cuh @@ -66,79 +66,79 @@ class COO { Index_Type n_cols; /** - * @param stream: CUDA stream to use - */ + * @param stream: CUDA stream to use + */ COO(cudaStream_t stream) - : rows_arr(0, stream), - cols_arr(0, stream), - vals_arr(0, stream), - nnz(0), - n_rows(0), - n_cols(0) {} + : rows_arr(0, stream), cols_arr(0, stream), vals_arr(0, stream), nnz(0), n_rows(0), n_cols(0) + { + } /** - * @param rows: coo rows array - * @param cols: coo cols array - * @param vals: coo vals array - * @param nnz: size of the rows/cols/vals arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of cols in the dense matrix - */ - COO(rmm::device_uvector &rows, - rmm::device_uvector &cols, rmm::device_uvector &vals, - Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0) - : rows_arr(rows), - cols_arr(cols), - vals_arr(vals), - nnz(nnz), - n_rows(n_rows), - n_cols(n_cols) {} + * @param rows: coo rows array + * @param cols: coo cols array + * @param vals: coo vals array + * @param nnz: size of the rows/cols/vals arrays + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of cols in the dense matrix + */ + COO(rmm::device_uvector& rows, + rmm::device_uvector& cols, + rmm::device_uvector& vals, + Index_Type nnz, + Index_Type n_rows = 0, + Index_Type n_cols = 0) + : rows_arr(rows), cols_arr(cols), vals_arr(vals), nnz(nnz), n_rows(n_rows), n_cols(n_cols) + { + } /** - * @param stream: CUDA stream to use - * @param nnz: size of the rows/cols/vals arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of cols in the dense matrix - * @param init: initialize arrays with zeros - */ - COO(cudaStream_t stream, Index_Type nnz, Index_Type n_rows = 0, - Index_Type n_cols = 0, bool init = true) + * @param stream: CUDA stream to use + * @param nnz: size of the rows/cols/vals arrays + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of cols in the dense matrix + * @param init: initialize arrays with zeros + */ + COO(cudaStream_t stream, + Index_Type nnz, + Index_Type n_rows = 0, + Index_Type n_cols = 0, + bool init = true) : rows_arr(nnz, stream), cols_arr(nnz, stream), vals_arr(nnz, stream), nnz(nnz), n_rows(n_rows), - n_cols(n_cols) { + n_cols(n_cols) + { if (init) init_arrays(stream); } - void init_arrays(cudaStream_t stream) { - CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0, - this->nnz * sizeof(Index_Type), stream)); - CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0, - this->nnz * sizeof(Index_Type), stream)); - CUDA_CHECK( - cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream)); + void init_arrays(cudaStream_t stream) + { + CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0, this->nnz * sizeof(Index_Type), stream)); + CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0, this->nnz * sizeof(Index_Type), stream)); + CUDA_CHECK(cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream)); } ~COO() {} /** - * @brief Size should be > 0, with the number of rows - * and cols in the dense matrix being > 0. - */ - bool validate_size() const { + * @brief Size should be > 0, with the number of rows + * and cols in the dense matrix being > 0. + */ + bool validate_size() const + { if (this->nnz < 0 || n_rows < 0 || n_cols < 0) return false; return true; } /** - * @brief If the underlying arrays have not been set, - * return false. Otherwise true. - */ - bool validate_mem() const { - if (this->rows_arr.size() == 0 || this->cols_arr.size() == 0 || - this->vals_arr.size() == 0) { + * @brief If the underlying arrays have not been set, + * return false. Otherwise true. + */ + bool validate_mem() const + { + if (this->rows_arr.size() == 0 || this->cols_arr.size() == 0 || this->vals_arr.size() == 0) { return false; } @@ -148,33 +148,30 @@ class COO { /* * @brief Returns the rows array */ - Index_Type *rows() { return this->rows_arr.data(); } + Index_Type* rows() { return this->rows_arr.data(); } /** * @brief Returns the cols array */ - Index_Type *cols() { return this->cols_arr.data(); } + Index_Type* cols() { return this->cols_arr.data(); } /** * @brief Returns the vals array */ - T *vals() { return this->vals_arr.data(); } + T* vals() { return this->vals_arr.data(); } /** - * @brief Send human-readable state information to output stream - */ - friend std::ostream &operator<<(std::ostream &out, - const COO &c) { + * @brief Send human-readable state information to output stream + */ + friend std::ostream& operator<<(std::ostream& out, const COO& c) + { if (c.validate_size() && c.validate_mem()) { cudaStream_t stream; CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream) - << std::endl; - out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream) - << std::endl; - out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream) - << std::endl; + out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream) << std::endl; + out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream) << std::endl; + out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream) << std::endl; out << "nnz=" << c.nnz << std::endl; out << "n_rows=" << c.n_rows << std::endl; out << "n_cols=" << c.n_cols << std::endl; @@ -188,58 +185,59 @@ class COO { } /** - * @brief Set the number of rows and cols - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of columns in the dense matrix - */ - void setSize(int n_rows, int n_cols) { + * @brief Set the number of rows and cols + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of columns in the dense matrix + */ + void setSize(int n_rows, int n_cols) + { this->n_rows = n_rows; this->n_cols = n_cols; } /** - * @brief Set the number of rows and cols for a square dense matrix - * @param n: number of rows and cols - */ - void setSize(int n) { + * @brief Set the number of rows and cols for a square dense matrix + * @param n: number of rows and cols + */ + void setSize(int n) + { this->n_rows = n; this->n_cols = n; } /** - * @brief Allocate the underlying arrays - * @param nnz: size of underlying row/col/val arrays - * @param init: should values be initialized to 0? - * @param stream: CUDA stream to use - */ - void allocate(int nnz, bool init, cudaStream_t stream) { - this->allocate(nnz, 0, init, stream); - } + * @brief Allocate the underlying arrays + * @param nnz: size of underlying row/col/val arrays + * @param init: should values be initialized to 0? + * @param stream: CUDA stream to use + */ + void allocate(int nnz, bool init, cudaStream_t stream) { this->allocate(nnz, 0, init, stream); } /** - * @brief Allocate the underlying arrays - * @param nnz: size of the underlying row/col/val arrays - * @param size: the number of rows/cols in a square dense matrix - * @param init: should values be initialized to 0? - * @param stream: CUDA stream to use - */ - void allocate(int nnz, int size, bool init, cudaStream_t stream) { + * @brief Allocate the underlying arrays + * @param nnz: size of the underlying row/col/val arrays + * @param size: the number of rows/cols in a square dense matrix + * @param init: should values be initialized to 0? + * @param stream: CUDA stream to use + */ + void allocate(int nnz, int size, bool init, cudaStream_t stream) + { this->allocate(nnz, size, size, init, stream); } /** - * @brief Allocate the underlying arrays - * @param nnz: size of the underlying row/col/val arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of columns in the dense matrix - * @param init: should values be initialized to 0? - * @param stream: stream to use for init - */ - void allocate(int nnz, int n_rows, int n_cols, bool init, - cudaStream_t stream) { + * @brief Allocate the underlying arrays + * @param nnz: size of the underlying row/col/val arrays + * @param n_rows: number of rows in the dense matrix + * @param n_cols: number of columns in the dense matrix + * @param init: should values be initialized to 0? + * @param stream: stream to use for init + */ + void allocate(int nnz, int n_rows, int n_cols, bool init, cudaStream_t stream) + { this->n_rows = n_rows; this->n_cols = n_cols; - this->nnz = nnz; + this->nnz = nnz; this->rows_arr.resize(this->nnz, stream); this->cols_arr.resize(this->nnz, stream); diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh index 041aedf41c..f821ce2b98 100644 --- a/cpp/include/raft/sparse/csr.cuh +++ b/cpp/include/raft/sparse/csr.cuh @@ -41,57 +41,64 @@ namespace sparse { struct WeakCCState { public: - bool *m; - WeakCCState(bool *m) : m(m) {} + bool* m; + WeakCCState(bool* m) : m(m) {} }; template -__global__ void weak_cc_label_device(Index_ *__restrict__ labels, - const Index_ *__restrict__ row_ind, - const Index_ *__restrict__ row_ind_ptr, - Index_ nnz, bool *__restrict__ m, - Index_ start_vertex_id, Index_ batch_size, - Index_ N, Lambda filter_op) { - Index_ tid = threadIdx.x + blockIdx.x * TPB_X; +__global__ void weak_cc_label_device(Index_* __restrict__ labels, + const Index_* __restrict__ row_ind, + const Index_* __restrict__ row_ind_ptr, + Index_ nnz, + bool* __restrict__ m, + Index_ start_vertex_id, + Index_ batch_size, + Index_ N, + Lambda filter_op) +{ + Index_ tid = threadIdx.x + blockIdx.x * TPB_X; Index_ global_id = tid + start_vertex_id; if (tid < batch_size && global_id < N) { Index_ start = __ldg(row_ind + tid); Index_ ci, cj; - bool ci_mod = false; - ci = labels[global_id]; + bool ci_mod = false; + ci = labels[global_id]; bool ci_allow_prop = filter_op(global_id); Index_ end = get_stop_idx(tid, batch_size, nnz, row_ind); /// TODO: add one element to row_ind and avoid get_stop_idx for (Index_ j = start; j < end; j++) { - Index_ j_ind = __ldg(row_ind_ptr + j); - cj = labels[j_ind]; + Index_ j_ind = __ldg(row_ind_ptr + j); + cj = labels[j_ind]; bool cj_allow_prop = filter_op(j_ind); if (ci < cj && ci_allow_prop) { if (sizeof(Index_) == 4) - atomicMin((int *)(labels + j_ind), ci); + atomicMin((int*)(labels + j_ind), ci); else if (sizeof(Index_) == 8) - atomicMin((long long int *)(labels + j_ind), ci); + atomicMin((long long int*)(labels + j_ind), ci); if (cj_allow_prop) *m = true; } else if (ci > cj && cj_allow_prop) { - ci = cj; + ci = cj; ci_mod = true; } } if (ci_mod) { if (sizeof(Index_) == 4) - atomicMin((int *)(labels + global_id), ci); + atomicMin((int*)(labels + global_id), ci); else if (sizeof(Index_) == 8) - atomicMin((long long int *)(labels + global_id), ci); + atomicMin((long long int*)(labels + global_id), ci); if (ci_allow_prop) *m = true; } } } template -__global__ void weak_cc_init_all_kernel(Index_ *labels, Index_ N, - Index_ MAX_LABEL, Lambda filter_op) { +__global__ void weak_cc_init_all_kernel(Index_* labels, + Index_ N, + Index_ MAX_LABEL, + Lambda filter_op) +{ Index_ tid = threadIdx.x + blockIdx.x * TPB_X; if (tid < N) { if (filter_op(tid)) @@ -123,22 +130,25 @@ __global__ void weak_cc_init_all_kernel(Index_ *labels, Index_ N, * @param filter_op an optional filtering function to determine which points * should get considered for labeling. It gets global indexes (not batch-wide!) */ -template bool> -void weak_cc_batched(Index_ *labels, const Index_ *row_ind, - const Index_ *row_ind_ptr, Index_ nnz, Index_ N, - Index_ start_vertex_id, Index_ batch_size, - WeakCCState *state, cudaStream_t stream, - Lambda filter_op) { - ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8, - "Index_ should be 4 or 8 bytes"); +template bool> +void weak_cc_batched(Index_* labels, + const Index_* row_ind, + const Index_* row_ind_ptr, + Index_ nnz, + Index_ N, + Index_ start_vertex_id, + Index_ batch_size, + WeakCCState* state, + cudaStream_t stream, + Lambda filter_op) +{ + ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8, "Index_ should be 4 or 8 bytes"); bool host_m; Index_ MAX_LABEL = std::numeric_limits::max(); weak_cc_init_all_kernel - <<>>( - labels, N, MAX_LABEL, filter_op); + <<>>(labels, N, MAX_LABEL, filter_op); CUDA_CHECK(cudaPeekAtLastError()); int n_iters = 0; @@ -147,8 +157,7 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, weak_cc_label_device <<>>( - labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id, - batch_size, N, filter_op); + labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id, batch_size, N, filter_op); CUDA_CHECK(cudaPeekAtLastError()); //** Updating m * @@ -180,12 +189,25 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, * @param stream the cuda stream to use */ template -void weak_cc_batched(Index_ *labels, const Index_ *row_ind, - const Index_ *row_ind_ptr, Index_ nnz, Index_ N, - Index_ start_vertex_id, Index_ batch_size, - WeakCCState *state, cudaStream_t stream) { - weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, start_vertex_id, - batch_size, state, stream, +void weak_cc_batched(Index_* labels, + const Index_* row_ind, + const Index_* row_ind_ptr, + Index_ nnz, + Index_ N, + Index_ start_vertex_id, + Index_ batch_size, + WeakCCState* state, + cudaStream_t stream) +{ + weak_cc_batched(labels, + row_ind, + row_ind_ptr, + nnz, + N, + start_vertex_id, + batch_size, + state, + stream, [] __device__(Index_ tid) { return true; }); } @@ -212,14 +234,18 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, * @param filter_op an optional filtering function to determine which points * should get considered for labeling. It gets global indexes (not batch-wide!) */ -template bool> -void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, - Index_ nnz, Index_ N, cudaStream_t stream, Lambda filter_op) { +template bool> +void weak_cc(Index_* labels, + const Index_* row_ind, + const Index_* row_ind_ptr, + Index_ nnz, + Index_ N, + cudaStream_t stream, + Lambda filter_op) +{ rmm::device_scalar m(stream); WeakCCState state(m.data()); - weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, - stream, filter_op); + weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, filter_op); } /** @@ -244,12 +270,17 @@ void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, * @param stream the cuda stream to use */ template -void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, - Index_ nnz, Index_ N, cudaStream_t stream) { +void weak_cc(Index_* labels, + const Index_* row_ind, + const Index_* row_ind_ptr, + Index_ nnz, + Index_ N, + cudaStream_t stream) +{ rmm::device_scalar m(stream); WeakCCState state(m.data()); - weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, - stream, [](Index_) { return true; }); + weak_cc_batched( + labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, [](Index_) { return true; }); } }; // namespace sparse diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h index d072100672..29a244a962 100644 --- a/cpp/include/raft/sparse/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -23,10 +23,9 @@ //#include #define _CUSPARSE_ERR_TO_STR(err) \ - case err: \ - return #err; + case err: return #err; -//Notes: +// Notes: //(1.) CUDA_VER_10_1_UP aggregates all the CUDA version selection logic; //(2.) to enforce a lower version, // @@ -43,16 +42,15 @@ namespace raft { * @brief Exception thrown when a cuSparse error is encountered. */ struct cusparse_error : public raft::exception { - explicit cusparse_error(char const* const message) - : raft::exception(message) {} - explicit cusparse_error(std::string const& message) - : raft::exception(message) {} + explicit cusparse_error(char const* const message) : raft::exception(message) {} + explicit cusparse_error(std::string const& message) : raft::exception(message) {} }; namespace sparse { namespace detail { -inline const char* cusparse_error_to_string(cusparseStatus_t err) { +inline const char* cusparse_error_to_string(cusparseStatus_t err) +{ #if defined(CUDART_VERSION) && CUDART_VERSION >= 10100 return cusparseGetErrorString(err); #else // CUDART_VERSION @@ -65,8 +63,7 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) { _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED); _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR); _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); - default: - return "CUSPARSE_STATUS_UNKNOWN"; + default: return "CUSPARSE_STATUS_UNKNOWN"; }; #endif // CUDART_VERSION } @@ -88,8 +85,11 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) { cusparseStatus_t const status = (call); \ if (CUSPARSE_STATUS_SUCCESS != status) { \ std::string msg{}; \ - SET_ERROR_MSG(msg, "cuSparse error encountered at: ", \ - "call='%s', Reason=%d:%s", #call, status, \ + SET_ERROR_MSG(msg, \ + "cuSparse error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ raft::sparse::detail::cusparse_error_to_string(status)); \ throw raft::cusparse_error(msg); \ } \ @@ -100,13 +100,15 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) { //@todo: use logger here once logging is enabled /** check for cusparse runtime API errors but do not assert */ -#define CUSPARSE_CHECK_NO_THROW(call) \ - do { \ - cusparseStatus_t err = call; \ - if (err != CUSPARSE_STATUS_SUCCESS) { \ - printf("CUSPARSE call='%s' got errorcode=%d err=%s", #call, err, \ - raft::sparse::detail::cusparse_error_to_string(err)); \ - } \ +#define CUSPARSE_CHECK_NO_THROW(call) \ + do { \ + cusparseStatus_t err = call; \ + if (err != CUSPARSE_STATUS_SUCCESS) { \ + printf("CUSPARSE call='%s' got errorcode=%d err=%s", \ + #call, \ + err, \ + raft::sparse::detail::cusparse_error_to_string(err)); \ + } \ } while (0) namespace raft { @@ -117,28 +119,34 @@ namespace sparse { * @{ */ template -cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, const T* vals, - T* vals_sorted, int* d_P, cudaStream_t stream); +cusparseStatus_t cusparsegthr( + cusparseHandle_t handle, int nnz, const T* vals, T* vals_sorted, int* d_P, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, - const double* vals, double* vals_sorted, - int* d_P, cudaStream_t stream) { +inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, + int nnz, + const double* vals, + double* vals_sorted, + int* d_P, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDgthr(handle, nnz, vals, vals_sorted, d_P, - CUSPARSE_INDEX_BASE_ZERO); + return cusparseDgthr(handle, nnz, vals, vals_sorted, d_P, CUSPARSE_INDEX_BASE_ZERO); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, - const float* vals, float* vals_sorted, - int* d_P, cudaStream_t stream) { +inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, + int nnz, + const float* vals, + float* vals_sorted, + int* d_P, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseSgthr(handle, nnz, vals, vals_sorted, d_P, - CUSPARSE_INDEX_BASE_ZERO); + return cusparseSgthr(handle, nnz, vals, vals_sorted, d_P, CUSPARSE_INDEX_BASE_ZERO); #pragma GCC diagnostic pop } /** @} */ @@ -148,15 +156,18 @@ inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, * @{ */ template -void cusparsecoo2csr(cusparseHandle_t handle, const T* cooRowInd, int nnz, - int m, T* csrRowPtr, cudaStream_t stream); +void cusparsecoo2csr( + cusparseHandle_t handle, const T* cooRowInd, int nnz, int m, T* csrRowPtr, cudaStream_t stream); template <> -inline void cusparsecoo2csr(cusparseHandle_t handle, const int* cooRowInd, - int nnz, int m, int* csrRowPtr, - cudaStream_t stream) { +inline void cusparsecoo2csr(cusparseHandle_t handle, + const int* cooRowInd, + int nnz, + int m, + int* csrRowPtr, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK(cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr, - CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CHECK(cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr, CUSPARSE_INDEX_BASE_ZERO)); } /** @} */ @@ -166,30 +177,54 @@ inline void cusparsecoo2csr(cusparseHandle_t handle, const int* cooRowInd, */ template size_t cusparsecoosort_bufferSizeExt( // NOLINT - cusparseHandle_t handle, int m, int n, int nnz, const T* cooRows, - const T* cooCols, cudaStream_t stream); + cusparseHandle_t handle, + int m, + int n, + int nnz, + const T* cooRows, + const T* cooCols, + cudaStream_t stream); template <> inline size_t cusparsecoosort_bufferSizeExt( // NOLINT - cusparseHandle_t handle, int m, int n, int nnz, const int* cooRows, - const int* cooCols, cudaStream_t stream) { + cusparseHandle_t handle, + int m, + int n, + int nnz, + const int* cooRows, + const int* cooCols, + cudaStream_t stream) +{ size_t val; CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK( - cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols, &val)); + CUSPARSE_CHECK(cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols, &val)); return val; } template void cusparsecoosortByRow( // NOLINT - cusparseHandle_t handle, int m, int n, int nnz, T* cooRows, T* cooCols, T* P, - void* pBuffer, cudaStream_t stream); + cusparseHandle_t handle, + int m, + int n, + int nnz, + T* cooRows, + T* cooCols, + T* P, + void* pBuffer, + cudaStream_t stream); template <> inline void cusparsecoosortByRow( // NOLINT - cusparseHandle_t handle, int m, int n, int nnz, int* cooRows, int* cooCols, - int* P, void* pBuffer, cudaStream_t stream) { + cusparseHandle_t handle, + int m, + int n, + int nnz, + int* cooRows, + int* cooCols, + int* P, + void* pBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK( - cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer)); + CUSPARSE_CHECK(cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer)); } /** @} */ @@ -199,37 +234,67 @@ inline void cusparsecoosortByRow( // NOLINT */ template cusparseStatus_t cusparsegemmi( // NOLINT - cusparseHandle_t handle, int m, int n, int k, int nnz, const T* alpha, - const T* A, int lda, const T* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const T* beta, T* C, int ldc, cudaStream_t stream); + cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const T* alpha, + const T* A, + int lda, + const T* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, + const T* beta, + T* C, + int ldc, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, - int k, int nnz, const float* alpha, - const float* A, int lda, +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const float* alpha, + const float* A, + int lda, const float* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const float* beta, - float* C, int ldc, cudaStream_t stream) { + const int* cscRowIndB, + const float* beta, + float* C, + int ldc, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseSgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, - cscColPtrB, cscRowIndB, beta, C, ldc); + return cusparseSgemmi( + handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, - int k, int nnz, const double* alpha, - const double* A, int lda, +inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const double* alpha, + const double* A, + int lda, const double* cscValB, const int* cscColPtrB, - const int* cscRowIndB, const double* beta, - double* C, int ldc, cudaStream_t stream) { + const int* cscRowIndB, + const double* beta, + double* C, + int ldc, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, - cscColPtrB, cscRowIndB, beta, C, ldc); + return cusparseDgemmi( + handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); #pragma GCC diagnostic pop } /** @} */ @@ -241,49 +306,94 @@ inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, */ template cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, int64_t nnz, - IndexT* csrRowOffsets, IndexT* csrColInd, + int64_t rows, + int64_t cols, + int64_t nnz, + IndexT* csrRowOffsets, + IndexT* csrColInd, ValueT* csrValues); template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, - int64_t nnz, int* csrRowOffsets, - int* csrColInd, float* csrValues) { - return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, - csrColInd, csrValues, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + int64_t rows, + int64_t cols, + int64_t nnz, + int* csrRowOffsets, + int* csrColInd, + float* csrValues) +{ + return cusparseCreateCsr(spMatDescr, + rows, + cols, + nnz, + csrRowOffsets, + csrColInd, + csrValues, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); } template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, - int64_t nnz, int* csrRowOffsets, - int* csrColInd, double* csrValues) { - return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, - csrColInd, csrValues, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + int64_t rows, + int64_t cols, + int64_t nnz, + int* csrRowOffsets, + int* csrColInd, + double* csrValues) +{ + return cusparseCreateCsr(spMatDescr, + rows, + cols, + nnz, + csrRowOffsets, + csrColInd, + csrValues, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F); } template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, - int64_t nnz, int64_t* csrRowOffsets, + int64_t rows, + int64_t cols, + int64_t nnz, + int64_t* csrRowOffsets, int64_t* csrColInd, - float* csrValues) { - return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, - csrColInd, csrValues, CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, + float* csrValues) +{ + return cusparseCreateCsr(spMatDescr, + rows, + cols, + nnz, + csrRowOffsets, + csrColInd, + csrValues, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); } template <> inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, - int64_t rows, int64_t cols, - int64_t nnz, int64_t* csrRowOffsets, + int64_t rows, + int64_t cols, + int64_t nnz, + int64_t* csrRowOffsets, int64_t* csrColInd, - double* csrValues) { - return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets, - csrColInd, csrValues, CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, + double* csrValues) +{ + return cusparseCreateCsr(spMatDescr, + rows, + cols, + nnz, + csrRowOffsets, + csrColInd, + csrValues, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F); } /** @} */ @@ -292,16 +402,19 @@ inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr, * @{ */ template -cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, - int64_t size, T* values); +cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, int64_t size, T* values); template <> inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, - int64_t size, float* values) { + int64_t size, + float* values) +{ return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_32F); } template <> inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, - int64_t size, double* values) { + int64_t size, + double* values) +{ return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_64F); } /** @} */ @@ -312,23 +425,30 @@ inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, */ template cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, - int64_t rows, int64_t cols, int64_t ld, - T* values, cusparseOrder_t order); + int64_t rows, + int64_t cols, + int64_t ld, + T* values, + cusparseOrder_t order); template <> inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, - int64_t rows, int64_t cols, - int64_t ld, float* values, - cusparseOrder_t order) { - return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_32F, - order); + int64_t rows, + int64_t cols, + int64_t ld, + float* values, + cusparseOrder_t order) +{ + return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_32F, order); } template <> inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, - int64_t rows, int64_t cols, - int64_t ld, double* values, - cusparseOrder_t order) { - return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_64F, - order); + int64_t rows, + int64_t cols, + int64_t ld, + double* values, + cusparseOrder_t order) +{ + return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_64F, order); } /** @} */ @@ -337,58 +457,89 @@ inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr, * @{ */ template -cusparseStatus_t cusparsespmv_buffersize( - cusparseHandle_t handle, cusparseOperation_t opA, const T* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const T* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - size_t* bufferSize, cudaStream_t stream); +cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle, + cusparseOperation_t opA, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const T* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + size_t* bufferSize, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmv_buffersize( - cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle, + cusparseOperation_t opA, + const float* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const float* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, - CUDA_R_32F, alg, bufferSize); + return cusparseSpMV_bufferSize( + handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, alg, bufferSize); } template <> -inline cusparseStatus_t cusparsespmv_buffersize( - cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle, + cusparseOperation_t opA, + const double* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const double* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, - CUDA_R_64F, alg, bufferSize); + return cusparseSpMV_bufferSize( + handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, bufferSize); } template -cusparseStatus_t cusparsespmv(cusparseHandle_t handle, cusparseOperation_t opA, - const T* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnVecDescr_t vecX, const T* beta, +cusparseStatus_t cusparsespmv(cusparseHandle_t handle, + cusparseOperation_t opA, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const T* beta, const cusparseDnVecDescr_t vecY, - cusparseSpMVAlg_t alg, T* externalBuffer, + cusparseSpMVAlg_t alg, + T* externalBuffer, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmv( - cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - float* externalBuffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle, + cusparseOperation_t opA, + const float* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const float* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + float* externalBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, - alg, externalBuffer); + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, alg, externalBuffer); } template <> -inline cusparseStatus_t cusparsespmv( - cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha, - const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX, - const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, - double* externalBuffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle, + cusparseOperation_t opA, + const double* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const double* beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, + double* externalBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, - alg, externalBuffer); + return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, externalBuffer); } /** @} */ #else @@ -398,29 +549,59 @@ inline cusparseStatus_t cusparsespmv( */ template cusparseStatus_t cusparsecsrmv( // NOLINT - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, - const T* alpha, const cusparseMatDescr_t descr, const T* csrVal, - const int* csrRowPtr, const int* csrColInd, const T* x, const T* beta, T* y, + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const T* alpha, + const cusparseMatDescr_t descr, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const T* x, + const T* beta, + T* y, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsrmv( - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, - const float* alpha, const cusparseMatDescr_t descr, const float* csrVal, - const int* csrRowPtr, const int* csrColInd, const float* x, const float* beta, - float* y, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const float* alpha, + const cusparseMatDescr_t descr, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const float* x, + const float* beta, + float* y, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, - csrRowPtr, csrColInd, x, beta, y); + return cusparseScsrmv( + handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); } template <> -inline cusparseStatus_t cusparsecsrmv( - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz, - const double* alpha, const cusparseMatDescr_t descr, const double* csrVal, - const int* csrRowPtr, const int* csrColInd, const double* x, - const double* beta, double* y, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int nnz, + const double* alpha, + const cusparseMatDescr_t descr, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const double* x, + const double* beta, + double* y, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal, - csrRowPtr, csrColInd, x, beta, y); + return cusparseDcsrmv( + handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y); } /** @} */ #endif @@ -431,58 +612,96 @@ inline cusparseStatus_t cusparsecsrmv( * @{ */ template -cusparseStatus_t cusparsespmm_bufferSize( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const T* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream); +cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const T* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + size_t* bufferSize, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmm_bufferSize( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const float* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const float* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const float* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, - matC, CUDA_R_32F, alg, bufferSize); + return cusparseSpMM_bufferSize( + handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_32F, alg, bufferSize); } template <> -inline cusparseStatus_t cusparsespmm_bufferSize( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const double* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const double* beta, - cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, size_t* bufferSize, - cudaStream_t stream) { +inline cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const double* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const double* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta, - matC, CUDA_R_64F, alg, bufferSize); + return cusparseSpMM_bufferSize( + handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_64F, alg, bufferSize); } template -inline cusparseStatus_t cusparsespmm( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const T* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, T* externalBuffer, cudaStream_t stream); +inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const T* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const T* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + T* externalBuffer, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsespmm( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const float* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC, - cusparseSpMMAlg_t alg, float* externalBuffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const float* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const float* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + float* externalBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, - CUDA_R_32F, alg, externalBuffer); + return cusparseSpMM( + handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_32F, alg, externalBuffer); } template <> -inline cusparseStatus_t cusparsespmm( - cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, - const double* alpha, const cusparseSpMatDescr_t matA, - const cusparseDnMatDescr_t matB, const double* beta, - cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, double* externalBuffer, - cudaStream_t stream) { +inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const double* alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnMatDescr_t matB, + const double* beta, + cusparseDnMatDescr_t matC, + cusparseSpMMAlg_t alg, + double* externalBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC, - CUDA_R_64F, alg, externalBuffer); + return cusparseSpMM( + handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_64F, alg, externalBuffer); } /** @} */ #else @@ -492,31 +711,68 @@ inline cusparseStatus_t cusparsespmm( */ template cusparseStatus_t cusparsecsrmm( // NOLINT - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, - int nnz, const T* alpha, const cusparseMatDescr_t descr, const T* csrVal, - const int* csrRowPtr, const int* csrColInd, const T* x, const int ldx, - const T* beta, T* y, const int ldy, cudaStream_t stream); + cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int k, + int nnz, + const T* alpha, + const cusparseMatDescr_t descr, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const T* x, + const int ldx, + const T* beta, + T* y, + const int ldy, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsrmm( - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, - int nnz, const float* alpha, const cusparseMatDescr_t descr, - const float* csrVal, const int* csrRowPtr, const int* csrColInd, - const float* x, const int ldx, const float* beta, float* y, const int ldy, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int k, + int nnz, + const float* alpha, + const cusparseMatDescr_t descr, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const float* x, + const int ldx, + const float* beta, + float* y, + const int ldy, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, - csrRowPtr, csrColInd, x, ldx, beta, y, ldy); + return cusparseScsrmm( + handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); } template <> -inline cusparseStatus_t cusparsecsrmm( - cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k, - int nnz, const double* alpha, const cusparseMatDescr_t descr, - const double* csrVal, const int* csrRowPtr, const int* csrColInd, - const double* x, const int ldx, const double* beta, double* y, const int ldy, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + int n, + int k, + int nnz, + const double* alpha, + const cusparseMatDescr_t descr, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + const double* x, + const int ldx, + const double* beta, + double* y, + const int ldy, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal, - csrRowPtr, csrColInd, x, ldx, beta, y, ldy); + return cusparseDcsrmm( + handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy); } /** @} */ #endif @@ -527,15 +783,22 @@ inline cusparseStatus_t cusparsecsrmm( */ template void cusparsecsr2coo( // NOLINT - cusparseHandle_t handle, const int n, const int nnz, const T* csrRowPtr, - T* cooRowInd, cudaStream_t stream); + cusparseHandle_t handle, + const int n, + const int nnz, + const T* csrRowPtr, + T* cooRowInd, + cudaStream_t stream); template <> -inline void cusparsecsr2coo(cusparseHandle_t handle, const int n, const int nnz, - const int* csrRowPtr, int* cooRowInd, - cudaStream_t stream) { +inline void cusparsecsr2coo(cusparseHandle_t handle, + const int n, + const int nnz, + const int* csrRowPtr, + int* cooRowInd, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, - CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, CUSPARSE_INDEX_BASE_ZERO)); } /** @} */ @@ -553,7 +816,8 @@ inline void cusparsecsr2coo(cusparseHandle_t handle, const int n, const int nnz, // template<> inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle, cusparsePointerMode_t mode, - cudaStream_t stream) { + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); return cusparseSetPointerMode(handle, mode); } @@ -564,69 +828,203 @@ inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle, * @{ */ template -cusparseStatus_t cusparsecsrmvex_bufferSize( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA, - const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x, - const T* beta, T* y, size_t* bufferSizeInBytes, cudaStream_t stream); -template <> -inline cusparseStatus_t cusparsecsrmvex_bufferSize( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA, - const float* csrValA, const int* csrRowPtrA, const int* csrColIndA, - const float* x, const float* beta, float* y, size_t* bufferSizeInBytes, - cudaStream_t stream) { - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx_bufferSize( - handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F, descrA, csrValA, - CUDA_R_32F, csrRowPtrA, csrColIndA, x, CUDA_R_32F, beta, CUDA_R_32F, y, - CUDA_R_32F, CUDA_R_32F, bufferSizeInBytes); -} -template <> -inline cusparseStatus_t cusparsecsrmvex_bufferSize( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA, - const double* csrValA, const int* csrRowPtrA, const int* csrColIndA, - const double* x, const double* beta, double* y, size_t* bufferSizeInBytes, - cudaStream_t stream) { - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx_bufferSize( - handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F, descrA, csrValA, - CUDA_R_64F, csrRowPtrA, csrColIndA, x, CUDA_R_64F, beta, CUDA_R_64F, y, - CUDA_R_64F, CUDA_R_64F, bufferSizeInBytes); +cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const T* alpha, + const cusparseMatDescr_t descrA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const T* x, + const T* beta, + T* y, + size_t* bufferSizeInBytes, + cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const float* x, + const float* beta, + float* y, + size_t* bufferSizeInBytes, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx_bufferSize(handle, + alg, + transA, + m, + n, + nnz, + alpha, + CUDA_R_32F, + descrA, + csrValA, + CUDA_R_32F, + csrRowPtrA, + csrColIndA, + x, + CUDA_R_32F, + beta, + CUDA_R_32F, + y, + CUDA_R_32F, + CUDA_R_32F, + bufferSizeInBytes); +} +template <> +inline cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const double* x, + const double* beta, + double* y, + size_t* bufferSizeInBytes, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx_bufferSize(handle, + alg, + transA, + m, + n, + nnz, + alpha, + CUDA_R_64F, + descrA, + csrValA, + CUDA_R_64F, + csrRowPtrA, + csrColIndA, + x, + CUDA_R_64F, + beta, + CUDA_R_64F, + y, + CUDA_R_64F, + CUDA_R_64F, + bufferSizeInBytes); } template -cusparseStatus_t cusparsecsrmvex( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA, - const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x, - const T* beta, T* y, T* buffer, cudaStream_t stream); -template <> -inline cusparseStatus_t cusparsecsrmvex( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA, - const float* csrValA, const int* csrRowPtrA, const int* csrColIndA, - const float* x, const float* beta, float* y, float* buffer, - cudaStream_t stream) { - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F, - descrA, csrValA, CUDA_R_32F, csrRowPtrA, csrColIndA, x, - CUDA_R_32F, beta, CUDA_R_32F, y, CUDA_R_32F, - CUDA_R_32F, buffer); -} -template <> -inline cusparseStatus_t cusparsecsrmvex( - cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, - int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA, - const double* csrValA, const int* csrRowPtrA, const int* csrColIndA, - const double* x, const double* beta, double* y, double* buffer, - cudaStream_t stream) { - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F, - descrA, csrValA, CUDA_R_64F, csrRowPtrA, csrColIndA, x, - CUDA_R_64F, beta, CUDA_R_64F, y, CUDA_R_64F, - CUDA_R_64F, buffer); +cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const T* alpha, + const cusparseMatDescr_t descrA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const T* x, + const T* beta, + T* y, + T* buffer, + cudaStream_t stream); +template <> +inline cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const float* x, + const float* beta, + float* y, + float* buffer, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx(handle, + alg, + transA, + m, + n, + nnz, + alpha, + CUDA_R_32F, + descrA, + csrValA, + CUDA_R_32F, + csrRowPtrA, + csrColIndA, + x, + CUDA_R_32F, + beta, + CUDA_R_32F, + y, + CUDA_R_32F, + CUDA_R_32F, + buffer); +} +template <> +inline cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const double* x, + const double* beta, + double* y, + double* buffer, + cudaStream_t stream) +{ + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseCsrmvEx(handle, + alg, + transA, + m, + n, + nnz, + alpha, + CUDA_R_64F, + descrA, + csrValA, + CUDA_R_64F, + csrRowPtrA, + csrColIndA, + x, + CUDA_R_64F, + beta, + CUDA_R_64F, + y, + CUDA_R_64F, + CUDA_R_64F, + buffer); } /** @} */ @@ -637,68 +1035,180 @@ inline cusparseStatus_t cusparsecsrmvex( */ template -cusparseStatus_t cusparsecsr2csc_bufferSize( - cusparseHandle_t handle, int m, int n, int nnz, const T* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream); +cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle, + int m, + int n, + int nnz, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + size_t* bufferSize, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsr2csc_bufferSize( - cusparseHandle_t handle, int m, int n, int nnz, const float* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle, + int m, + int n, + int nnz, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2_bufferSize( - handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, - cscRowInd, CUDA_R_32F, copyValues, idxBase, alg, bufferSize); + return cusparseCsr2cscEx2_bufferSize(handle, + m, + n, + nnz, + csrVal, + csrRowPtr, + csrColInd, + cscVal, + cscColPtr, + cscRowInd, + CUDA_R_32F, + copyValues, + idxBase, + alg, + bufferSize); } template <> -inline cusparseStatus_t cusparsecsr2csc_bufferSize( - cusparseHandle_t handle, int m, int n, int nnz, const double* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle, + int m, + int n, + int nnz, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + size_t* bufferSize, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2_bufferSize( - handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, - cscRowInd, CUDA_R_64F, copyValues, idxBase, alg, bufferSize); + return cusparseCsr2cscEx2_bufferSize(handle, + m, + n, + nnz, + csrVal, + csrRowPtr, + csrColInd, + cscVal, + cscColPtr, + cscRowInd, + CUDA_R_64F, + copyValues, + idxBase, + alg, + bufferSize); } template -cusparseStatus_t cusparsecsr2csc( - cusparseHandle_t handle, int m, int n, int nnz, const T* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream); +cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle, + int m, + int n, + int nnz, + const T* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + void* buffer, + cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsr2csc( - cusparseHandle_t handle, int m, int n, int nnz, const float* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle, + int m, + int n, + int nnz, + const float* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + void* buffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, - cscVal, cscColPtr, cscRowInd, CUDA_R_32F, - copyValues, idxBase, alg, buffer); + return cusparseCsr2cscEx2(handle, + m, + n, + nnz, + csrVal, + csrRowPtr, + csrColInd, + cscVal, + cscColPtr, + cscRowInd, + CUDA_R_32F, + copyValues, + idxBase, + alg, + buffer); } template <> -inline cusparseStatus_t cusparsecsr2csc( - cusparseHandle_t handle, int m, int n, int nnz, const double* csrVal, - const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr, - int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase, - cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream) { +inline cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle, + int m, + int n, + int nnz, + const double* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + void* buffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, - cscVal, cscColPtr, cscRowInd, CUDA_R_64F, - copyValues, idxBase, alg, buffer); + return cusparseCsr2cscEx2(handle, + m, + n, + nnz, + csrVal, + csrRowPtr, + csrColInd, + cscVal, + cscColPtr, + cscRowInd, + CUDA_R_64F, + copyValues, + idxBase, + alg, + buffer); } /** @} */ @@ -709,120 +1219,329 @@ inline cusparseStatus_t cusparsecsr2csc( */ template -cusparseStatus_t cusparsecsrgemm2_buffersizeext( - cusparseHandle_t handle, int m, int n, int k, const T* alpha, const T* beta, - const cusparseMatDescr_t matA, int nnzA, const int* rowindA, - const int* indicesA, const cusparseMatDescr_t matB, int nnzB, - const int* rowindB, const int* indicesB, const cusparseMatDescr_t matD, - int nnzD, const int* rowindD, const int* indicesD, csrgemm2Info_t info, - size_t* pBufferSizeInBytes, cudaStream_t stream); - -template <> -inline cusparseStatus_t cusparsecsrgemm2_buffersizeext( - cusparseHandle_t handle, int m, int n, int k, const float* alpha, - const float* beta, const cusparseMatDescr_t matA, int nnzA, - const int* rowindA, const int* indicesA, const cusparseMatDescr_t matB, - int nnzB, const int* rowindB, const int* indicesB, - const cusparseMatDescr_t matD, int nnzD, const int* rowindD, - const int* indicesD, csrgemm2Info_t info, size_t* pBufferSizeInBytes, - cudaStream_t stream) { +cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle, + int m, + int n, + int k, + const T* alpha, + const T* beta, + const cusparseMatDescr_t matA, + int nnzA, + const int* rowindA, + const int* indicesA, + const cusparseMatDescr_t matB, + int nnzB, + const int* rowindB, + const int* indicesB, + const cusparseMatDescr_t matD, + int nnzD, + const int* rowindD, + const int* indicesD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes, + cudaStream_t stream); + +template <> +inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle, + int m, + int n, + int k, + const float* alpha, + const float* beta, + const cusparseMatDescr_t matA, + int nnzA, + const int* rowindA, + const int* indicesA, + const cusparseMatDescr_t matB, + int nnzB, + const int* rowindB, + const int* indicesB, + const cusparseMatDescr_t matD, + int nnzD, + const int* rowindD, + const int* indicesD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseScsrgemm2_bufferSizeExt( - handle, m, n, k, alpha, matA, nnzA, rowindA, indicesA, matB, nnzB, rowindB, - indicesB, beta, matD, nnzD, rowindD, indicesD, info, pBufferSizeInBytes); + return cusparseScsrgemm2_bufferSizeExt(handle, + m, + n, + k, + alpha, + matA, + nnzA, + rowindA, + indicesA, + matB, + nnzB, + rowindB, + indicesB, + beta, + matD, + nnzD, + rowindD, + indicesD, + info, + pBufferSizeInBytes); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsecsrgemm2_buffersizeext( - cusparseHandle_t handle, int m, int n, int k, const double* alpha, - const double* beta, const cusparseMatDescr_t matA, int nnzA, - const int* rowindA, const int* indicesA, const cusparseMatDescr_t matB, - int nnzB, const int* rowindB, const int* indicesB, - const cusparseMatDescr_t matD, int nnzD, const int* rowindD, - const int* indicesD, csrgemm2Info_t info, size_t* pBufferSizeInBytes, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle, + int m, + int n, + int k, + const double* alpha, + const double* beta, + const cusparseMatDescr_t matA, + int nnzA, + const int* rowindA, + const int* indicesA, + const cusparseMatDescr_t matB, + int nnzB, + const int* rowindB, + const int* indicesB, + const cusparseMatDescr_t matD, + int nnzD, + const int* rowindD, + const int* indicesD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDcsrgemm2_bufferSizeExt( - handle, m, n, k, alpha, matA, nnzA, rowindA, indicesA, matB, nnzB, rowindB, - indicesB, beta, matD, nnzD, rowindD, indicesD, info, pBufferSizeInBytes); + return cusparseDcsrgemm2_bufferSizeExt(handle, + m, + n, + k, + alpha, + matA, + nnzA, + rowindA, + indicesA, + matB, + nnzB, + rowindB, + indicesB, + beta, + matD, + nnzD, + rowindD, + indicesD, + info, + pBufferSizeInBytes); #pragma GCC diagnostic pop } -inline cusparseStatus_t cusparsecsrgemm2nnz( - cusparseHandle_t handle, int m, int n, int k, const cusparseMatDescr_t matA, - int nnzA, const int* rowindA, const int* indicesA, - const cusparseMatDescr_t matB, int nnzB, const int* rowindB, - const int* indicesB, const cusparseMatDescr_t matD, int nnzD, - const int* rowindD, const int* indicesD, const cusparseMatDescr_t matC, - int* rowindC, int* nnzC, const csrgemm2Info_t info, void* pBuffer, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrgemm2nnz(cusparseHandle_t handle, + int m, + int n, + int k, + const cusparseMatDescr_t matA, + int nnzA, + const int* rowindA, + const int* indicesA, + const cusparseMatDescr_t matB, + int nnzB, + const int* rowindB, + const int* indicesB, + const cusparseMatDescr_t matD, + int nnzD, + const int* rowindD, + const int* indicesD, + const cusparseMatDescr_t matC, + int* rowindC, + int* nnzC, + const csrgemm2Info_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseXcsrgemm2Nnz(handle, m, n, k, matA, nnzA, rowindA, indicesA, - matB, nnzB, rowindB, indicesB, matD, nnzD, - rowindD, indicesD, matC, rowindC, nnzC, info, + return cusparseXcsrgemm2Nnz(handle, + m, + n, + k, + matA, + nnzA, + rowindA, + indicesA, + matB, + nnzB, + rowindB, + indicesB, + matD, + nnzD, + rowindD, + indicesD, + matC, + rowindC, + nnzC, + info, pBuffer); #pragma GCC diagnostic pop } template -cusparseStatus_t cusparsecsrgemm2( - cusparseHandle_t handle, int m, int n, int k, const T* alpha, - const cusparseMatDescr_t descrA, int nnzA, const T* csrValA, - const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB, - int nnzB, const T* csrValB, const int* csrRowPtrB, const int* csrColIndB, - const T* beta, const cusparseMatDescr_t descrD, int nnzD, const T* csrValD, - const int* csrRowPtrD, const int* csrColIndD, const cusparseMatDescr_t descrC, - T* csrValC, const int* csrRowPtrC, int* csrColIndC, const csrgemm2Info_t info, - void* pBuffer, cudaStream_t stream); - -template <> -inline cusparseStatus_t cusparsecsrgemm2( - cusparseHandle_t handle, int m, int n, int k, const float* alpha, - const cusparseMatDescr_t descrA, int nnzA, const float* csrValA, - const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB, - int nnzB, const float* csrValB, const int* csrRowPtrB, const int* csrColIndB, - const float* beta, const cusparseMatDescr_t descrD, int nnzD, - const float* csrValD, const int* csrRowPtrD, const int* csrColIndD, - const cusparseMatDescr_t descrC, float* csrValC, const int* csrRowPtrC, - int* csrColIndC, const csrgemm2Info_t info, void* pBuffer, - cudaStream_t stream) { +cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const T* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const T* csrValB, + const int* csrRowPtrB, + const int* csrColIndB, + const T* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const T* csrValD, + const int* csrRowPtrD, + const int* csrColIndD, + const cusparseMatDescr_t descrC, + T* csrValC, + const int* csrRowPtrC, + int* csrColIndC, + const csrgemm2Info_t info, + void* pBuffer, + cudaStream_t stream); + +template <> +inline cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const float* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const float* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const float* csrValB, + const int* csrRowPtrB, + const int* csrColIndB, + const float* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const float* csrValD, + const int* csrRowPtrD, + const int* csrColIndD, + const cusparseMatDescr_t descrC, + float* csrValC, + const int* csrRowPtrC, + int* csrColIndC, + const csrgemm2Info_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseScsrgemm2(handle, m, n, k, alpha, descrA, nnzA, csrValA, - csrRowPtrA, csrColIndA, descrB, nnzB, csrValB, - csrRowPtrB, csrColIndB, beta, descrD, nnzD, csrValD, - csrRowPtrD, csrColIndD, descrC, csrValC, csrRowPtrC, - csrColIndC, info, pBuffer); + return cusparseScsrgemm2(handle, + m, + n, + k, + alpha, + descrA, + nnzA, + csrValA, + csrRowPtrA, + csrColIndA, + descrB, + nnzB, + csrValB, + csrRowPtrB, + csrColIndB, + beta, + descrD, + nnzD, + csrValD, + csrRowPtrD, + csrColIndD, + descrC, + csrValC, + csrRowPtrC, + csrColIndC, + info, + pBuffer); #pragma GCC diagnostic pop } template <> -inline cusparseStatus_t cusparsecsrgemm2( - cusparseHandle_t handle, int m, int n, int k, const double* alpha, - const cusparseMatDescr_t descrA, int nnzA, const double* csrValA, - const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB, - int nnzB, const double* csrValB, const int* csrRowPtrB, const int* csrColIndB, - const double* beta, const cusparseMatDescr_t descrD, int nnzD, - const double* csrValD, const int* csrRowPtrD, const int* csrColIndD, - const cusparseMatDescr_t descrC, double* csrValC, const int* csrRowPtrC, - int* csrColIndC, const csrgemm2Info_t info, void* pBuffer, - cudaStream_t stream) { +inline cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const double* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const double* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const double* csrValB, + const int* csrRowPtrB, + const int* csrColIndB, + const double* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const double* csrValD, + const int* csrRowPtrD, + const int* csrColIndD, + const cusparseMatDescr_t descrC, + double* csrValC, + const int* csrRowPtrC, + int* csrColIndC, + const csrgemm2Info_t info, + void* pBuffer, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - return cusparseDcsrgemm2(handle, m, n, k, alpha, descrA, nnzA, csrValA, - csrRowPtrA, csrColIndA, descrB, nnzB, csrValB, - csrRowPtrB, csrColIndB, beta, descrD, nnzD, csrValD, - csrRowPtrD, csrColIndD, descrC, csrValC, csrRowPtrC, - csrColIndC, info, pBuffer); + return cusparseDcsrgemm2(handle, + m, + n, + k, + alpha, + descrA, + nnzA, + csrValA, + csrRowPtrA, + csrColIndA, + descrB, + nnzB, + csrValB, + csrRowPtrB, + csrColIndB, + beta, + descrD, + nnzD, + csrValD, + csrRowPtrD, + csrColIndD, + descrC, + csrValC, + csrRowPtrC, + csrColIndC, + info, + pBuffer); #pragma GCC diagnostic pop } @@ -834,33 +1553,46 @@ inline cusparseStatus_t cusparsecsrgemm2( */ template -cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n, +cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, + int m, + int n, const cusparseMatDescr_t descrA, - const T* csrValA, const int* csrRowPtrA, - const int* csrColIndA, T* A, int lda, + const T* csrValA, + const int* csrRowPtrA, + const int* csrColIndA, + T* A, + int lda, cudaStream_t stream); template <> -inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n, +inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, + int m, + int n, const cusparseMatDescr_t descrA, const float* csrValA, const int* csrRowPtrA, - const int* csrColIndA, float* A, - int lda, cudaStream_t stream) { + const int* csrColIndA, + float* A, + int lda, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, - csrColIndA, A, lda); + return cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda); } template <> -inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n, +inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, + int m, + int n, const cusparseMatDescr_t descrA, const double* csrValA, const int* csrRowPtrA, - const int* csrColIndA, double* A, - int lda, cudaStream_t stream) { + const int* csrColIndA, + double* A, + int lda, + cudaStream_t stream) +{ CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - return cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, - csrColIndA, A, lda); + return cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda); } /** @} */ diff --git a/cpp/include/raft/sparse/distance/common.h b/cpp/include/raft/sparse/distance/common.h index 1c55412eec..29c823bcdb 100644 --- a/cpp/include/raft/sparse/distance/common.h +++ b/cpp/include/raft/sparse/distance/common.h @@ -24,31 +24,31 @@ namespace distance { template struct distances_config_t { - distances_config_t(const raft::handle_t &handle_) : handle(handle_) {} + distances_config_t(const raft::handle_t& handle_) : handle(handle_) {} // left side value_idx a_nrows; value_idx a_ncols; value_idx a_nnz; - value_idx *a_indptr; - value_idx *a_indices; - value_t *a_data; + value_idx* a_indptr; + value_idx* a_indices; + value_t* a_data; // right side value_idx b_nrows; value_idx b_ncols; value_idx b_nnz; - value_idx *b_indptr; - value_idx *b_indices; - value_t *b_data; + value_idx* b_indptr; + value_idx* b_indices; + value_t* b_data; - const raft::handle_t &handle; + const raft::handle_t& handle; }; template class distances_t { public: - virtual void compute(value_t *out) {} + virtual void compute(value_t* out) {} virtual ~distances_t() = default; }; diff --git a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh index 3f8c32a20b..4d3b31df9a 100644 --- a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh @@ -35,9 +35,11 @@ namespace distance { namespace detail { // @TODO: Move this into sparse prims (coo_norm) template -__global__ void compute_binary_row_norm_kernel( - value_t *out, const value_idx *__restrict__ coo_rows, - const value_t *__restrict__ data, value_idx nnz) { +__global__ void compute_binary_row_norm_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ value_idx i = blockDim.x * blockIdx.x + threadIdx.x; if (i < nnz) { // We do conditional here only because it's @@ -49,54 +51,63 @@ __global__ void compute_binary_row_norm_kernel( } template -__global__ void compute_binary_warp_kernel(value_t *__restrict__ C, - const value_t *__restrict__ Q_norms, - const value_t *__restrict__ R_norms, - value_idx n_rows, value_idx n_cols, - expansion_f expansion_func) { +__global__ void compute_binary_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_norms, + const value_t* __restrict__ R_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func) +{ std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x; - value_idx i = tid / n_cols; - value_idx j = tid % n_cols; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; if (i >= n_rows || j >= n_cols) return; - value_t q_norm = Q_norms[i]; - value_t r_norm = R_norms[j]; - value_t dot = C[(size_t)i * n_cols + j]; + value_t q_norm = Q_norms[i]; + value_t r_norm = R_norms[j]; + value_t dot = C[(size_t)i * n_cols + j]; C[(size_t)i * n_cols + j] = expansion_func(dot, q_norm, r_norm); } -template -void compute_binary(value_t *C, const value_t *Q_norms, const value_t *R_norms, - value_idx n_rows, value_idx n_cols, - expansion_f expansion_func, cudaStream_t stream) { +template +void compute_binary(value_t* C, + const value_t* Q_norms, + const value_t* R_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func, + cudaStream_t stream) +{ int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); compute_binary_warp_kernel<<>>( C, Q_norms, R_norms, n_rows, n_cols, expansion_func); } -template -void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows, - const value_t *Q_data, value_idx Q_nnz, - const value_idx *R_coo_rows, const value_t *R_data, - value_idx R_nnz, value_idx m, value_idx n, - cudaStream_t stream, expansion_f expansion_func) { +template +void compute_bin_distance(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + cudaStream_t stream, + expansion_f expansion_func) +{ rmm::device_uvector Q_norms(m, stream); rmm::device_uvector R_norms(n, stream); - CUDA_CHECK( - cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); compute_binary_row_norm_kernel<<>>( Q_norms.data(), Q_coo_rows, Q_data, Q_nnz); compute_binary_row_norm_kernel<<>>( R_norms.data(), R_coo_rows, R_data, R_nnz); - compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, - stream); + compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, stream); } /** @@ -106,44 +117,51 @@ void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows, template class jaccard_expanded_distances_t : public distances_t { public: - explicit jaccard_expanded_distances_t( - const distances_config_t &config) - : config_(&config), - workspace(0, config.handle.get_stream()), - ip_dists(config) {} + explicit jaccard_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_bin_distance( - out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, - b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - value_t q_r_union = q_norm + r_norm; - value_t denom = q_r_union - dot; - - value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom); - - // flip the similarity when both rows are 0 - bool both_empty = q_r_union == 0; - return 1 - ((!both_empty * jacc) + both_empty); - }); + compute_bin_distance(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + value_t denom = q_r_union - dot; + + value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom); + + // flip the similarity when both rows are 0 + bool both_empty = q_r_union == 0; + return 1 - ((!both_empty * jacc) + both_empty); + }); } ~jaccard_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; @@ -155,40 +173,47 @@ class jaccard_expanded_distances_t : public distances_t { template class dice_expanded_distances_t : public distances_t { public: - explicit dice_expanded_distances_t( - const distances_config_t &config) - : config_(&config), - workspace(0, config.handle.get_stream()), - ip_dists(config) {} + explicit dice_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_bin_distance( - out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, - b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - value_t q_r_union = q_norm + r_norm; - value_t dice = (2 * dot) / q_r_union; - bool both_empty = q_r_union == 0; - return 1 - ((!both_empty * dice) + both_empty); - }); + compute_bin_distance(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t q_r_union = q_norm + r_norm; + value_t dice = (2 * dot) / q_r_union; + bool both_empty = q_r_union == 0; + return 1 - ((!both_empty * dice) + both_empty); + }); } ~dice_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh index 83844b8c54..6694d0fc4f 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh @@ -39,19 +39,29 @@ namespace sparse { namespace distance { namespace detail { -template inline void balanced_coo_pairwise_generalized_spmv( - value_t *out_dists, const distances_config_t &config_, - value_idx *coo_rows_b, product_f product_func, accum_f accum_func, - write_f write_func, strategy_t strategy, int chunk_size = 500000) { - CUDA_CHECK(cudaMemsetAsync( - out_dists, 0, sizeof(value_t) * config_.a_nrows * config_.b_nrows, - config_.handle.get_stream())); - - strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, - chunk_size); + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + strategy_t strategy, + int chunk_size = 500000) +{ + CUDA_CHECK(cudaMemsetAsync(out_dists, + 0, + sizeof(value_t) * config_.a_nrows * config_.b_nrows, + config_.handle.get_stream())); + + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); }; /** @@ -87,39 +97,55 @@ inline void balanced_coo_pairwise_generalized_spmv( * this value was found through profiling and represents a reasonable * setting for both large and small densities */ -template +template inline void balanced_coo_pairwise_generalized_spmv( - value_t *out_dists, const distances_config_t &config_, - value_idx *coo_rows_b, product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size = 500000) { - CUDA_CHECK(cudaMemsetAsync( - out_dists, 0, sizeof(value_t) * config_.a_nrows * config_.b_nrows, - config_.handle.get_stream())); + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size = 500000) +{ + CUDA_CHECK(cudaMemsetAsync(out_dists, + 0, + sizeof(value_t) * config_.a_nrows * config_.b_nrows, + config_.handle.get_stream())); int max_cols = max_cols_per_block(); if (max_cols > config_.a_ncols) { - dense_smem_strategy strategy( - config_); - strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, - write_func, chunk_size); + dense_smem_strategy strategy(config_); + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); } else { hash_strategy strategy(config_); - strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, - write_func, chunk_size); + strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size); } }; -template inline void balanced_coo_pairwise_generalized_spmv_rev( - value_t *out_dists, const distances_config_t &config_, - value_idx *coo_rows_a, product_f product_func, accum_f accum_func, - write_f write_func, strategy_t strategy, int chunk_size = 500000) { - strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, - write_func, chunk_size); + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + strategy_t strategy, + int chunk_size = 500000) +{ + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); }; /** @@ -158,24 +184,30 @@ inline void balanced_coo_pairwise_generalized_spmv_rev( * this value was found through profiling and represents a reasonable * setting for both large and small densities */ -template +template inline void balanced_coo_pairwise_generalized_spmv_rev( - value_t *out_dists, const distances_config_t &config_, - value_idx *coo_rows_a, product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size = 500000) { + value_t* out_dists, + const distances_config_t& config_, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size = 500000) +{ // try dense first int max_cols = max_cols_per_block(); if (max_cols > config_.b_ncols) { - dense_smem_strategy strategy( - config_); - strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, - write_func, chunk_size); + dense_smem_strategy strategy(config_); + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); } else { hash_strategy strategy(config_); - strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, - write_func, chunk_size); + strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size); } }; diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh index 866ff43224..9bfdd3bad0 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh @@ -27,68 +27,88 @@ namespace sparse { namespace distance { namespace detail { /** - * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with - * sparse-matrix-sparse-vector multiplication layout (SPMV). - * This is intended to be scheduled n_chunks_b times for each row of a. - * The steps are as follows: - * - * 1. Load row from A into dense vector in shared memory. - * This can be further chunked in the future if necessary to support larger - * column sizes. - * 2. Threads of block all step through chunks of B in parallel. - * When a new row is encountered in row_indices_b, a segmented - * reduction is performed across the warps and then across the - * block and the final value written out to host memory. - * - * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf - * - * @tparam value_idx index type - * @tparam value_t value type - * @tparam tpb threads per block configured on launch - * @tparam rev if this is true, the reduce/accumulate functions are only - * executed when A[col] == 0.0. when executed before/after !rev - * and A & B are reversed, this allows the full symmetric difference - * and intersection to be computed. - * @tparam kv_t data type stored in shared mem cache - * @tparam product_f reduce function type (semiring product() function). - * accepts two arguments of value_t and returns a value_t - * @tparam accum_f accumulation function type (semiring sum() function). - * accepts two arguments of value_t and returns a value_t - * @tparam write_f function to write value out. this should be mathematically - * equivalent to the accumulate function but implemented as - * an atomic operation on global memory. Accepts two arguments - * of value_t* and value_t and updates the value given by the - * pointer. - * @param[in] indptrA column pointer array for A - * @param[in] indicesA column indices array for A - * @param[in] dataA data array for A - * @param[in] rowsB coo row array for B - * @param[in] indicesB column indices array for B - * @param[in] dataB data array for B - * @param[in] m number of rows in A - * @param[in] n number of rows in B - * @param[in] dim number of features - * @param[in] nnz_b number of nonzeros in B - * @param[out] out array of size m*n - * @param[in] n_blocks_per_row number of blocks of B per row of A - * @param[in] chunk_size number of nnz for B to use for each row of A - * @param[in] buffer_size amount of smem to use for each row of A - * @param[in] product_func semiring product() function - * @param[in] accum_func semiring sum() function - * @param[in] write_func atomic semiring sum() function - */ -template -__global__ void balanced_coo_generalized_spmv_kernel( - strategy_t strategy, indptr_it indptrA, value_idx *indicesA, value_t *dataA, - value_idx nnz_a, value_idx *rowsB, value_idx *indicesB, value_t *dataB, - value_idx m, value_idx n, int dim, value_idx nnz_b, value_t *out, - int n_blocks_per_row, int chunk_size, value_idx b_ncols, - product_f product_func, accum_f accum_func, write_f write_func) { + * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with + * sparse-matrix-sparse-vector multiplication layout (SPMV). + * This is intended to be scheduled n_chunks_b times for each row of a. + * The steps are as follows: + * + * 1. Load row from A into dense vector in shared memory. + * This can be further chunked in the future if necessary to support larger + * column sizes. + * 2. Threads of block all step through chunks of B in parallel. + * When a new row is encountered in row_indices_b, a segmented + * reduction is performed across the warps and then across the + * block and the final value written out to host memory. + * + * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf + * + * @tparam value_idx index type + * @tparam value_t value type + * @tparam tpb threads per block configured on launch + * @tparam rev if this is true, the reduce/accumulate functions are only + * executed when A[col] == 0.0. when executed before/after !rev + * and A & B are reversed, this allows the full symmetric difference + * and intersection to be computed. + * @tparam kv_t data type stored in shared mem cache + * @tparam product_f reduce function type (semiring product() function). + * accepts two arguments of value_t and returns a value_t + * @tparam accum_f accumulation function type (semiring sum() function). + * accepts two arguments of value_t and returns a value_t + * @tparam write_f function to write value out. this should be mathematically + * equivalent to the accumulate function but implemented as + * an atomic operation on global memory. Accepts two arguments + * of value_t* and value_t and updates the value given by the + * pointer. + * @param[in] indptrA column pointer array for A + * @param[in] indicesA column indices array for A + * @param[in] dataA data array for A + * @param[in] rowsB coo row array for B + * @param[in] indicesB column indices array for B + * @param[in] dataB data array for B + * @param[in] m number of rows in A + * @param[in] n number of rows in B + * @param[in] dim number of features + * @param[in] nnz_b number of nonzeros in B + * @param[out] out array of size m*n + * @param[in] n_blocks_per_row number of blocks of B per row of A + * @param[in] chunk_size number of nnz for B to use for each row of A + * @param[in] buffer_size amount of smem to use for each row of A + * @param[in] product_func semiring product() function + * @param[in] accum_func semiring sum() function + * @param[in] write_func atomic semiring sum() function + */ +template +__global__ void balanced_coo_generalized_spmv_kernel(strategy_t strategy, + indptr_it indptrA, + value_idx* indicesA, + value_t* dataA, + value_idx nnz_a, + value_idx* rowsB, + value_idx* indicesB, + value_t* dataB, + value_idx m, + value_idx n, + int dim, + value_idx nnz_b, + value_t* out, + int n_blocks_per_row, + int chunk_size, + value_idx b_ncols, + product_f product_func, + accum_f accum_func, + write_f write_func) +{ typedef cub::WarpReduce warp_reduce; - value_idx cur_row_a = indptrA.get_row_idx(n_blocks_per_row); + value_idx cur_row_a = indptrA.get_row_idx(n_blocks_per_row); value_idx cur_chunk_offset = blockIdx.x % n_blocks_per_row; // chunk starting offset @@ -96,18 +116,17 @@ __global__ void balanced_coo_generalized_spmv_kernel( // how many total cols will be processed by this block (should be <= chunk_size * n_threads) value_idx active_chunk_size = min(chunk_size * tpb, nnz_b - ind_offset); - int tid = threadIdx.x; + int tid = threadIdx.x; int warp_id = tid / raft::warp_size(); // compute id relative to current warp unsigned int lane_id = tid & (raft::warp_size() - 1); - value_idx ind = ind_offset + threadIdx.x; + value_idx ind = ind_offset + threadIdx.x; extern __shared__ char smem[]; - typename strategy_t::smem_type A = (typename strategy_t::smem_type)(smem); - typename warp_reduce::TempStorage *temp_storage = - (typename warp_reduce::TempStorage *)(A + dim); + typename strategy_t::smem_type A = (typename strategy_t::smem_type)(smem); + typename warp_reduce::TempStorage* temp_storage = (typename warp_reduce::TempStorage*)(A + dim); auto inserter = strategy.init_insert(A, dim); @@ -115,13 +134,12 @@ __global__ void balanced_coo_generalized_spmv_kernel( value_idx start_offset_a, stop_offset_a; bool first_a_chunk, last_a_chunk; - indptrA.get_row_offsets(cur_row_a, start_offset_a, stop_offset_a, - n_blocks_per_row, first_a_chunk, last_a_chunk); + indptrA.get_row_offsets( + cur_row_a, start_offset_a, stop_offset_a, n_blocks_per_row, first_a_chunk, last_a_chunk); // Convert current row vector in A to dense for (int i = tid; i <= (stop_offset_a - start_offset_a); i += blockDim.x) { - strategy.insert(inserter, indicesA[start_offset_a + i], - dataA[start_offset_a + i]); + strategy.insert(inserter, indicesA[start_offset_a + i], dataA[start_offset_a + i]); } __syncthreads(); @@ -132,34 +150,36 @@ __global__ void balanced_coo_generalized_spmv_kernel( if (ind >= nnz_b) return; value_idx start_index_a = 0, stop_index_a = b_ncols - 1; - indptrA.get_indices_boundary(indicesA, cur_row_a, start_offset_a, - stop_offset_a, start_index_a, stop_index_a, - first_a_chunk, last_a_chunk); + indptrA.get_indices_boundary(indicesA, + cur_row_a, + start_offset_a, + stop_offset_a, + start_index_a, + stop_index_a, + first_a_chunk, + last_a_chunk); value_idx cur_row_b = -1; - value_t c = 0.0; + value_t c = 0.0; auto warp_red = warp_reduce(*(temp_storage + warp_id)); if (tid < active_chunk_size) { cur_row_b = rowsB[ind]; - auto index_b = indicesB[ind]; - auto in_bounds = - indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); + auto index_b = indicesB[ind]; + auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); if (in_bounds) { value_t a_col = strategy.find(finder, index_b); - if (!rev || a_col == 0.0) { - c = product_func(a_col, dataB[ind]); - } + if (!rev || a_col == 0.0) { c = product_func(a_col, dataB[ind]); } } } // loop through chunks in parallel, reducing when a new row is // encountered by each thread for (int i = tid; i < active_chunk_size; i += blockDim.x) { - value_idx ind_next = ind + blockDim.x; + value_idx ind_next = ind + blockDim.x; value_idx next_row_b = -1; if (i + blockDim.x < active_chunk_size) next_row_b = rowsB[ind_next]; @@ -170,14 +190,13 @@ __global__ void balanced_coo_generalized_spmv_kernel( // grab the threads currently participating in loops. // because any other threads should have returned already. unsigned int peer_group = __match_any_sync(0xffffffff, cur_row_b); - bool is_leader = get_lowest_peer(peer_group) == lane_id; - value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func); + bool is_leader = get_lowest_peer(peer_group) == lane_id; + value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func); // thread with lowest lane id among peers writes out if (is_leader && v != 0.0) { // this conditional should be uniform, since rev is constant - size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b - : (size_t)cur_row_b * m + cur_row_a; + size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b : (size_t)cur_row_b * m + cur_row_a; write_func(out + idx, v); } @@ -187,15 +206,12 @@ __global__ void balanced_coo_generalized_spmv_kernel( if (next_row_b != -1) { ind = ind_next; - auto index_b = indicesB[ind]; - auto in_bounds = - indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); + auto index_b = indicesB[ind]; + auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b); if (in_bounds) { value_t a_col = strategy.find(finder, index_b); - if (!rev || a_col == 0.0) { - c = accum_func(c, product_func(a_col, dataB[ind])); - } + if (!rev || a_col == 0.0) { c = accum_func(c, product_func(a_col, dataB[ind])); } } cur_row_b = next_row_b; diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh index 4ad3368c4a..9b1dfff022 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh @@ -31,58 +31,114 @@ namespace detail { template class coo_spmv_strategy { public: - coo_spmv_strategy(const distances_config_t &config_) - : config(config_) { + coo_spmv_strategy(const distances_config_t& config_) : config(config_) + { smem = raft::getSharedMemPerBlock(); } - template - void _dispatch_base(strategy_t &strategy, int smem_dim, indptr_it &a_indptr, - value_t *out_dists, value_idx *coo_rows_b, - product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size, int n_blocks, - int n_blocks_per_row) { - CUDA_CHECK(cudaFuncSetCacheConfig( - balanced_coo_generalized_spmv_kernel, - cudaFuncCachePreferShared)); + template + void _dispatch_base(strategy_t& strategy, + int smem_dim, + indptr_it& a_indptr, + value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size, + int n_blocks, + int n_blocks_per_row) + { + CUDA_CHECK(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); - balanced_coo_generalized_spmv_kernel - <<>>( - strategy, a_indptr, config.a_indices, config.a_data, config.a_nnz, - coo_rows_b, config.b_indices, config.b_data, config.a_nrows, - config.b_nrows, smem_dim, config.b_nnz, out_dists, n_blocks_per_row, - chunk_size, config.b_ncols, product_func, accum_func, write_func); + balanced_coo_generalized_spmv_kernel + <<>>(strategy, + a_indptr, + config.a_indices, + config.a_data, + config.a_nnz, + coo_rows_b, + config.b_indices, + config.b_data, + config.a_nrows, + config.b_nrows, + smem_dim, + config.b_nnz, + out_dists, + n_blocks_per_row, + chunk_size, + config.b_ncols, + product_func, + accum_func, + write_func); } - template - void _dispatch_base_rev(strategy_t &strategy, int smem_dim, - indptr_it &b_indptr, value_t *out_dists, - value_idx *coo_rows_a, product_f product_func, - accum_f accum_func, write_f write_func, - int chunk_size, int n_blocks, int n_blocks_per_row) { - CUDA_CHECK(cudaFuncSetCacheConfig( - balanced_coo_generalized_spmv_kernel, - cudaFuncCachePreferShared)); + template + void _dispatch_base_rev(strategy_t& strategy, + int smem_dim, + indptr_it& b_indptr, + value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size, + int n_blocks, + int n_blocks_per_row) + { + CUDA_CHECK(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel, + cudaFuncCachePreferShared)); - balanced_coo_generalized_spmv_kernel - <<>>( - strategy, b_indptr, config.b_indices, config.b_data, config.b_nnz, - coo_rows_a, config.a_indices, config.a_data, config.b_nrows, - config.a_nrows, smem_dim, config.a_nnz, out_dists, n_blocks_per_row, - chunk_size, config.a_ncols, product_func, accum_func, write_func); + balanced_coo_generalized_spmv_kernel + <<>>(strategy, + b_indptr, + config.b_indices, + config.b_data, + config.b_nnz, + coo_rows_a, + config.a_indices, + config.a_data, + config.b_nrows, + config.a_nrows, + smem_dim, + config.a_nnz, + out_dists, + n_blocks_per_row, + chunk_size, + config.a_ncols, + product_func, + accum_func, + write_func); } protected: int smem; - const distances_config_t &config; + const distances_config_t& config; }; } // namespace detail diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh index 0ab7b65ac2..da51767307 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh @@ -29,11 +29,15 @@ namespace detail { template class mask_row_it { public: - mask_row_it(const value_idx *full_indptr_, const value_idx &n_rows_, - value_idx *mask_row_idx_ = NULL) - : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_) {} + mask_row_it(const value_idx* full_indptr_, + const value_idx& n_rows_, + value_idx* mask_row_idx_ = NULL) + : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_) + { + } - __device__ inline value_idx get_row_idx(const int &n_blocks_nnz_b) { + __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b) + { if (mask_row_idx != NULL) { return mask_row_idx[blockIdx.x / n_blocks_nnz_b]; } else { @@ -41,37 +45,49 @@ class mask_row_it { } } - __device__ inline void get_row_offsets( - const value_idx &row_idx, value_idx &start_offset, value_idx &stop_offset, - const value_idx &n_blocks_nnz_b, bool &first_a_chunk, bool &last_a_chunk) { + __device__ inline void get_row_offsets(const value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + const value_idx& n_blocks_nnz_b, + bool& first_a_chunk, + bool& last_a_chunk) + { start_offset = full_indptr[row_idx]; - stop_offset = full_indptr[row_idx + 1] - 1; + stop_offset = full_indptr[row_idx + 1] - 1; } - __device__ constexpr inline void get_indices_boundary( - const value_idx *indices, value_idx &indices_len, value_idx &start_offset, - value_idx &stop_offset, value_idx &start_index, value_idx &stop_index, - bool &first_a_chunk, bool &last_a_chunk) { + __device__ constexpr inline void get_indices_boundary(const value_idx* indices, + value_idx& indices_len, + value_idx& start_offset, + value_idx& stop_offset, + value_idx& start_index, + value_idx& stop_index, + bool& first_a_chunk, + bool& last_a_chunk) + { // do nothing; } - __device__ constexpr inline bool check_indices_bounds( - value_idx &start_index_a, value_idx &stop_index_a, value_idx &index_b) { + __device__ constexpr inline bool check_indices_bounds(value_idx& start_index_a, + value_idx& stop_index_a, + value_idx& index_b) + { return true; } const value_idx *full_indptr, &n_rows; - value_idx *mask_row_idx; + value_idx* mask_row_idx; }; template -__global__ void fill_chunk_indices_kernel(value_idx *n_chunks_per_row, - value_idx *chunk_indices, - value_idx n_rows) { +__global__ void fill_chunk_indices_kernel(value_idx* n_chunks_per_row, + value_idx* chunk_indices, + value_idx n_rows) +{ auto tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < n_rows) { auto start = n_chunks_per_row[tid]; - auto end = n_chunks_per_row[tid + 1]; + auto end = n_chunks_per_row[tid + 1]; #pragma unroll for (int i = start; i < end; i++) { @@ -83,73 +99,89 @@ __global__ void fill_chunk_indices_kernel(value_idx *n_chunks_per_row, template class chunked_mask_row_it : public mask_row_it { public: - chunked_mask_row_it(const value_idx *full_indptr_, const value_idx &n_rows_, - value_idx *mask_row_idx_, int row_chunk_size_, - const value_idx *n_chunks_per_row_, - const value_idx *chunk_indices_, + chunked_mask_row_it(const value_idx* full_indptr_, + const value_idx& n_rows_, + value_idx* mask_row_idx_, + int row_chunk_size_, + const value_idx* n_chunks_per_row_, + const value_idx* chunk_indices_, const cudaStream_t stream_) : mask_row_it(full_indptr_, n_rows_, mask_row_idx_), row_chunk_size(row_chunk_size_), n_chunks_per_row(n_chunks_per_row_), chunk_indices(chunk_indices_), - stream(stream_) {} + stream(stream_) + { + } - static void init(const value_idx *indptr, const value_idx *mask_row_idx, - const value_idx &n_rows, const int row_chunk_size, - rmm::device_uvector &n_chunks_per_row, - rmm::device_uvector &chunk_indices, - cudaStream_t stream) { + static void init(const value_idx* indptr, + const value_idx* mask_row_idx, + const value_idx& n_rows, + const int row_chunk_size, + rmm::device_uvector& n_chunks_per_row, + rmm::device_uvector& chunk_indices, + cudaStream_t stream) + { auto policy = rmm::exec_policy(stream); constexpr value_idx first_element = 0; n_chunks_per_row.set_element_async(0, first_element, stream); n_chunks_per_row_functor chunk_functor(indptr, row_chunk_size); - thrust::transform(policy, mask_row_idx, mask_row_idx + n_rows, - n_chunks_per_row.begin() + 1, chunk_functor); + thrust::transform( + policy, mask_row_idx, mask_row_idx + n_rows, n_chunks_per_row.begin() + 1, chunk_functor); - thrust::inclusive_scan(policy, n_chunks_per_row.begin() + 1, - n_chunks_per_row.end(), - n_chunks_per_row.begin() + 1); + thrust::inclusive_scan( + policy, n_chunks_per_row.begin() + 1, n_chunks_per_row.end(), n_chunks_per_row.begin() + 1); - raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1, - stream); + raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1, stream); fill_chunk_indices(n_rows, n_chunks_per_row, chunk_indices, stream); } - __device__ inline value_idx get_row_idx(const int &n_blocks_nnz_b) { + __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b) + { return this->mask_row_idx[chunk_indices[blockIdx.x / n_blocks_nnz_b]]; } - __device__ inline void get_row_offsets( - const value_idx &row_idx, value_idx &start_offset, value_idx &stop_offset, - const int &n_blocks_nnz_b, bool &first_a_chunk, bool &last_a_chunk) { - auto chunk_index = blockIdx.x / n_blocks_nnz_b; - auto chunk_val = chunk_indices[chunk_index]; - auto prev_n_chunks = n_chunks_per_row[chunk_val]; + __device__ inline void get_row_offsets(const value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + const int& n_blocks_nnz_b, + bool& first_a_chunk, + bool& last_a_chunk) + { + auto chunk_index = blockIdx.x / n_blocks_nnz_b; + auto chunk_val = chunk_indices[chunk_index]; + auto prev_n_chunks = n_chunks_per_row[chunk_val]; auto relative_chunk = chunk_index - prev_n_chunks; - first_a_chunk = relative_chunk == 0; + first_a_chunk = relative_chunk == 0; start_offset = this->full_indptr[row_idx] + relative_chunk * row_chunk_size; - stop_offset = start_offset + row_chunk_size; + stop_offset = start_offset + row_chunk_size; auto final_stop_offset = this->full_indptr[row_idx + 1]; last_a_chunk = stop_offset >= final_stop_offset; - stop_offset = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1; + stop_offset = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1; } - __device__ inline void get_indices_boundary( - const value_idx *indices, value_idx &row_idx, value_idx &start_offset, - value_idx &stop_offset, value_idx &start_index, value_idx &stop_index, - bool &first_a_chunk, bool &last_a_chunk) { + __device__ inline void get_indices_boundary(const value_idx* indices, + value_idx& row_idx, + value_idx& start_offset, + value_idx& stop_offset, + value_idx& start_index, + value_idx& stop_index, + bool& first_a_chunk, + bool& last_a_chunk) + { start_index = first_a_chunk ? start_index : indices[start_offset - 1] + 1; - stop_index = last_a_chunk ? stop_index : indices[stop_offset]; + stop_index = last_a_chunk ? stop_index : indices[stop_offset]; } - __device__ inline bool check_indices_bounds(value_idx &start_index_a, - value_idx &stop_index_a, - value_idx &index_b) { + __device__ inline bool check_indices_bounds(value_idx& start_index_a, + value_idx& stop_index_a, + value_idx& index_b) + { return (index_b >= start_index_a && index_b <= stop_index_a); } @@ -160,30 +192,34 @@ class chunked_mask_row_it : public mask_row_it { struct n_chunks_per_row_functor { public: - n_chunks_per_row_functor(const value_idx *indptr_, - value_idx row_chunk_size_) - : indptr(indptr_), row_chunk_size(row_chunk_size_) {} + n_chunks_per_row_functor(const value_idx* indptr_, value_idx row_chunk_size_) + : indptr(indptr_), row_chunk_size(row_chunk_size_) + { + } - __host__ __device__ value_idx operator()(const value_idx &i) { + __host__ __device__ value_idx operator()(const value_idx& i) + { auto degree = indptr[i + 1] - indptr[i]; return raft::ceildiv(degree, (value_idx)row_chunk_size); } - const value_idx *indptr; + const value_idx* indptr; value_idx row_chunk_size; }; private: - static void fill_chunk_indices( - const value_idx &n_rows, rmm::device_uvector &n_chunks_per_row, - rmm::device_uvector &chunk_indices, cudaStream_t stream) { + static void fill_chunk_indices(const value_idx& n_rows, + rmm::device_uvector& n_chunks_per_row, + rmm::device_uvector& chunk_indices, + cudaStream_t stream) + { auto n_threads = std::min(n_rows, 256); - auto n_blocks = raft::ceildiv(n_rows, (value_idx)n_threads); + auto n_blocks = raft::ceildiv(n_rows, (value_idx)n_threads); chunk_indices.resize(total_row_blocks, stream); - fill_chunk_indices_kernel<<>>( - n_chunks_per_row.data(), chunk_indices.data(), n_rows); + fill_chunk_indices_kernel + <<>>(n_chunks_per_row.data(), chunk_indices.data(), n_rows); } }; diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh index 79a5f154d0..5a1c152bd0 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh @@ -26,71 +26,91 @@ namespace detail { template class dense_smem_strategy : public coo_spmv_strategy { public: - using smem_type = value_t *; + using smem_type = value_t*; using insert_type = smem_type; - using find_type = smem_type; + using find_type = smem_type; - dense_smem_strategy(const distances_config_t &config_) - : coo_spmv_strategy(config_) {} + dense_smem_strategy(const distances_config_t& config_) + : coo_spmv_strategy(config_) + { + } - inline static int smem_per_block(int n_cols) { - return (n_cols * sizeof(value_t)) + - ((1024 / raft::warp_size()) * sizeof(value_t)); + inline static int smem_per_block(int n_cols) + { + return (n_cols * sizeof(value_t)) + ((1024 / raft::warp_size()) * sizeof(value_t)); } template - void dispatch(value_t *out_dists, value_idx *coo_rows_b, - product_f product_func, accum_f accum_func, write_f write_func, - int chunk_size) { - auto n_blocks_per_row = - raft::ceildiv(this->config.b_nnz, chunk_size * 1024); - auto n_blocks = this->config.a_nrows * n_blocks_per_row; - - mask_row_it a_indptr(this->config.a_indptr, - this->config.a_nrows); - - this->_dispatch_base(*this, this->config.b_ncols, a_indptr, out_dists, - coo_rows_b, product_func, accum_func, write_func, - chunk_size, n_blocks, n_blocks_per_row); + void dispatch(value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * 1024); + auto n_blocks = this->config.a_nrows * n_blocks_per_row; + + mask_row_it a_indptr(this->config.a_indptr, this->config.a_nrows); + + this->_dispatch_base(*this, + this->config.b_ncols, + a_indptr, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_blocks, + n_blocks_per_row); } template - void dispatch_rev(value_t *out_dists, value_idx *coo_rows_a, - product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size) { - auto n_blocks_per_row = - raft::ceildiv(this->config.a_nnz, chunk_size * 1024); - auto n_blocks = this->config.b_nrows * n_blocks_per_row; - - mask_row_it b_indptr(this->config.b_indptr, - this->config.b_nrows); - - this->_dispatch_base_rev(*this, this->config.a_ncols, b_indptr, out_dists, - coo_rows_a, product_func, accum_func, write_func, - chunk_size, n_blocks, n_blocks_per_row); + void dispatch_rev(value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { + auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * 1024); + auto n_blocks = this->config.b_nrows * n_blocks_per_row; + + mask_row_it b_indptr(this->config.b_indptr, this->config.b_nrows); + + this->_dispatch_base_rev(*this, + this->config.a_ncols, + b_indptr, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_blocks, + n_blocks_per_row); } - __device__ inline insert_type init_insert(smem_type cache, - const value_idx &cache_size) { + __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size) + { for (int k = threadIdx.x; k < cache_size; k += blockDim.x) { cache[k] = 0.0; } return cache; } - __device__ inline void insert(insert_type cache, const value_idx &key, - const value_t &value) { + __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value) + { cache[key] = value; } - __device__ inline find_type init_find(smem_type cache, - const value_idx &cache_size) { + __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size) + { return cache; } - __device__ inline value_t find(find_type cache, const value_idx &key) { - return cache[key]; - } + __device__ inline value_t find(find_type cache, const value_idx& key) { return cache[key]; } }; } // namespace detail diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/hash_strategy.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/hash_strategy.cuh index 5ba2d5c102..4f8637b425 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/hash_strategy.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/hash_strategy.cuh @@ -1,18 +1,18 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once @@ -39,177 +39,238 @@ template class hash_strategy : public coo_spmv_strategy { public: using insert_type = - typename cuco::static_map::device_mutable_view; - using smem_type = typename insert_type::slot_type *; + typename cuco::static_map::device_mutable_view; + using smem_type = typename insert_type::slot_type*; using find_type = - typename cuco::static_map::device_view; + typename cuco::static_map::device_view; - hash_strategy(const distances_config_t &config_, - float capacity_threshold_ = 0.5, int map_size_ = get_map_size()) + hash_strategy(const distances_config_t& config_, + float capacity_threshold_ = 0.5, + int map_size_ = get_map_size()) : coo_spmv_strategy(config_), capacity_threshold(capacity_threshold_), - map_size(map_size_) {} + map_size(map_size_) + { + } - void chunking_needed(const value_idx *indptr, const value_idx n_rows, - rmm::device_uvector &mask_indptr, - std::tuple &n_rows_divided, - cudaStream_t stream) { + void chunking_needed(const value_idx* indptr, + const value_idx n_rows, + rmm::device_uvector& mask_indptr, + std::tuple& n_rows_divided, + cudaStream_t stream) + { auto policy = this->config.handle.get_thrust_policy(); - auto less = thrust::copy_if( - policy, thrust::make_counting_iterator(value_idx(0)), - thrust::make_counting_iterator(n_rows), mask_indptr.data(), - fits_in_hash_table(indptr, 0, capacity_threshold * map_size)); + auto less = thrust::copy_if(policy, + thrust::make_counting_iterator(value_idx(0)), + thrust::make_counting_iterator(n_rows), + mask_indptr.data(), + fits_in_hash_table(indptr, 0, capacity_threshold * map_size)); std::get<0>(n_rows_divided) = less - mask_indptr.data(); auto more = thrust::copy_if( - policy, thrust::make_counting_iterator(value_idx(0)), - thrust::make_counting_iterator(n_rows), less, - fits_in_hash_table(indptr, capacity_threshold * map_size, - std::numeric_limits::max())); + policy, + thrust::make_counting_iterator(value_idx(0)), + thrust::make_counting_iterator(n_rows), + less, + fits_in_hash_table( + indptr, capacity_threshold * map_size, std::numeric_limits::max())); std::get<1>(n_rows_divided) = more - less; } template - void dispatch(value_t *out_dists, value_idx *coo_rows_b, - product_f product_func, accum_f accum_func, write_f write_func, - int chunk_size) { + void dispatch(value_t* out_dists, + value_idx* coo_rows_b, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * tpb); - rmm::device_uvector mask_indptr( - this->config.a_nrows, this->config.handle.get_stream()); + rmm::device_uvector mask_indptr(this->config.a_nrows, + this->config.handle.get_stream()); std::tuple n_rows_divided; - chunking_needed(this->config.a_indptr, this->config.a_nrows, mask_indptr, - n_rows_divided, this->config.handle.get_stream()); + chunking_needed(this->config.a_indptr, + this->config.a_nrows, + mask_indptr, + n_rows_divided, + this->config.handle.get_stream()); auto less_rows = std::get<0>(n_rows_divided); if (less_rows > 0) { - mask_row_it less(this->config.a_indptr, less_rows, - mask_indptr.data()); + mask_row_it less(this->config.a_indptr, less_rows, mask_indptr.data()); auto n_less_blocks = less_rows * n_blocks_per_row; - this->_dispatch_base(*this, map_size, less, out_dists, coo_rows_b, - product_func, accum_func, write_func, chunk_size, - n_less_blocks, n_blocks_per_row); + this->_dispatch_base(*this, + map_size, + less, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_less_blocks, + n_blocks_per_row); } auto more_rows = std::get<1>(n_rows_divided); if (more_rows > 0) { - rmm::device_uvector n_chunks_per_row( - more_rows + 1, this->config.handle.get_stream()); - rmm::device_uvector chunk_indices( - 0, this->config.handle.get_stream()); - chunked_mask_row_it::init( - this->config.a_indptr, mask_indptr.data() + less_rows, more_rows, - capacity_threshold * map_size, n_chunks_per_row, chunk_indices, - this->config.handle.get_stream()); - - chunked_mask_row_it more( - this->config.a_indptr, more_rows, mask_indptr.data() + less_rows, - capacity_threshold * map_size, n_chunks_per_row.data(), - chunk_indices.data(), this->config.handle.get_stream()); + rmm::device_uvector n_chunks_per_row(more_rows + 1, + this->config.handle.get_stream()); + rmm::device_uvector chunk_indices(0, this->config.handle.get_stream()); + chunked_mask_row_it::init(this->config.a_indptr, + mask_indptr.data() + less_rows, + more_rows, + capacity_threshold * map_size, + n_chunks_per_row, + chunk_indices, + this->config.handle.get_stream()); + + chunked_mask_row_it more(this->config.a_indptr, + more_rows, + mask_indptr.data() + less_rows, + capacity_threshold * map_size, + n_chunks_per_row.data(), + chunk_indices.data(), + this->config.handle.get_stream()); auto n_more_blocks = more.total_row_blocks * n_blocks_per_row; - this->_dispatch_base(*this, map_size, more, out_dists, coo_rows_b, - product_func, accum_func, write_func, chunk_size, - n_more_blocks, n_blocks_per_row); + this->_dispatch_base(*this, + map_size, + more, + out_dists, + coo_rows_b, + product_func, + accum_func, + write_func, + chunk_size, + n_more_blocks, + n_blocks_per_row); } } template - void dispatch_rev(value_t *out_dists, value_idx *coo_rows_a, - product_f product_func, accum_f accum_func, - write_f write_func, int chunk_size) { + void dispatch_rev(value_t* out_dists, + value_idx* coo_rows_a, + product_f product_func, + accum_f accum_func, + write_f write_func, + int chunk_size) + { auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * tpb); - rmm::device_uvector mask_indptr( - this->config.b_nrows, this->config.handle.get_stream()); + rmm::device_uvector mask_indptr(this->config.b_nrows, + this->config.handle.get_stream()); std::tuple n_rows_divided; - chunking_needed(this->config.b_indptr, this->config.b_nrows, mask_indptr, - n_rows_divided, this->config.handle.get_stream()); + chunking_needed(this->config.b_indptr, + this->config.b_nrows, + mask_indptr, + n_rows_divided, + this->config.handle.get_stream()); auto less_rows = std::get<0>(n_rows_divided); if (less_rows > 0) { - mask_row_it less(this->config.b_indptr, less_rows, - mask_indptr.data()); + mask_row_it less(this->config.b_indptr, less_rows, mask_indptr.data()); auto n_less_blocks = less_rows * n_blocks_per_row; - this->_dispatch_base_rev(*this, map_size, less, out_dists, coo_rows_a, - product_func, accum_func, write_func, chunk_size, - n_less_blocks, n_blocks_per_row); + this->_dispatch_base_rev(*this, + map_size, + less, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_less_blocks, + n_blocks_per_row); } auto more_rows = std::get<1>(n_rows_divided); if (more_rows > 0) { - rmm::device_uvector n_chunks_per_row( - more_rows + 1, this->config.handle.get_stream()); - rmm::device_uvector chunk_indices( - 0, this->config.handle.get_stream()); - chunked_mask_row_it::init( - this->config.b_indptr, mask_indptr.data() + less_rows, more_rows, - capacity_threshold * map_size, n_chunks_per_row, chunk_indices, - this->config.handle.get_stream()); - - chunked_mask_row_it more( - this->config.b_indptr, more_rows, mask_indptr.data() + less_rows, - capacity_threshold * map_size, n_chunks_per_row.data(), - chunk_indices.data(), this->config.handle.get_stream()); + rmm::device_uvector n_chunks_per_row(more_rows + 1, + this->config.handle.get_stream()); + rmm::device_uvector chunk_indices(0, this->config.handle.get_stream()); + chunked_mask_row_it::init(this->config.b_indptr, + mask_indptr.data() + less_rows, + more_rows, + capacity_threshold * map_size, + n_chunks_per_row, + chunk_indices, + this->config.handle.get_stream()); + + chunked_mask_row_it more(this->config.b_indptr, + more_rows, + mask_indptr.data() + less_rows, + capacity_threshold * map_size, + n_chunks_per_row.data(), + chunk_indices.data(), + this->config.handle.get_stream()); auto n_more_blocks = more.total_row_blocks * n_blocks_per_row; - this->_dispatch_base_rev(*this, map_size, more, out_dists, coo_rows_a, - product_func, accum_func, write_func, chunk_size, - n_more_blocks, n_blocks_per_row); + this->_dispatch_base_rev(*this, + map_size, + more, + out_dists, + coo_rows_a, + product_func, + accum_func, + write_func, + chunk_size, + n_more_blocks, + n_blocks_per_row); } } - __device__ inline insert_type init_insert(smem_type cache, - const value_idx &cache_size) { + __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size) + { return insert_type::make_from_uninitialized_slots( cooperative_groups::this_thread_block(), cache, cache_size, -1, 0); } - __device__ inline void insert(insert_type cache, const value_idx &key, - const value_t &value) { + __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value) + { auto success = cache.insert(cuco::pair(key, value)); } - __device__ inline find_type init_find(smem_type cache, - const value_idx &cache_size) { + __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size) + { return find_type(cache, cache_size, -1, 0); } - __device__ inline value_t find(find_type cache, const value_idx &key) { + __device__ inline value_t find(find_type cache, const value_idx& key) + { auto a_pair = cache.find(key); value_t a_col = 0.0; - if (a_pair != cache.end()) { - a_col = a_pair->second; - } + if (a_pair != cache.end()) { a_col = a_pair->second; } return a_col; } struct fits_in_hash_table { public: - fits_in_hash_table(const value_idx *indptr_, value_idx degree_l_, - value_idx degree_r_) - : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_) {} + fits_in_hash_table(const value_idx* indptr_, value_idx degree_l_, value_idx degree_r_) + : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_) + { + } - __host__ __device__ bool operator()(const value_idx &i) { + __host__ __device__ bool operator()(const value_idx& i) + { auto degree = indptr[i + 1] - indptr[i]; return degree >= degree_l && degree < degree_r; } private: - const value_idx *indptr; + const value_idx* indptr; const value_idx degree_l, degree_r; }; - inline static int get_map_size() { - return (raft::getSharedMemPerBlock() - - ((tpb / raft::warp_size()) * sizeof(value_t))) / + inline static int get_map_size() + { + return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) / sizeof(typename insert_type::slot_type); } diff --git a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh index 2cd7b670d8..bde979a993 100644 --- a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh @@ -42,35 +42,38 @@ template class ip_distances_t : public distances_t { public: /** - * Computes simple sparse inner product distances as sum(x_y * y_k) - * @param[in] config specifies inputs, outputs, and sizes - */ - ip_distances_t(const distances_config_t &config) - : config_(&config), coo_rows_b(config.b_nnz, config.handle.get_stream()) { - raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, - coo_rows_b.data(), config_->b_nnz, + * Computes simple sparse inner product distances as sum(x_y * y_k) + * @param[in] config specifies inputs, outputs, and sizes + */ + ip_distances_t(const distances_config_t& config) + : config_(&config), coo_rows_b(config.b_nnz, config.handle.get_stream()) + { + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows_b.data(), + config_->b_nnz, config_->handle.get_stream()); } /** - * Performs pairwise distance computation and computes output distances - * @param out_distances dense output matrix (size a_nrows * b_nrows) - */ - void compute(value_t *out_distances) { + * Performs pairwise distance computation and computes output distances + * @param out_distances dense output matrix (size a_nrows * b_nrows) + */ + void compute(value_t* out_distances) + { /** - * Compute pairwise distances and return dense matrix in row-major format - */ + * Compute pairwise distances and return dense matrix in row-major format + */ balanced_coo_pairwise_generalized_spmv( - out_distances, *config_, coo_rows_b.data(), Product(), Sum(), - AtomicAdd()); + out_distances, *config_, coo_rows_b.data(), Product(), Sum(), AtomicAdd()); } - value_idx *b_rows_coo() { return coo_rows_b.data(); } + value_idx* b_rows_coo() { return coo_rows_b.data(); } - value_t *b_data_coo() { return config_->b_data; } + value_t* b_data_coo() { return config_->b_data; } private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector coo_rows_b; }; diff --git a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh index f06a15215c..a4a534823f 100644 --- a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh @@ -38,35 +38,36 @@ namespace detail { // @TODO: Move this into sparse prims (coo_norm) template -__global__ void compute_row_norm_kernel(value_t *out, - const value_idx *__restrict__ coo_rows, - const value_t *__restrict__ data, - value_idx nnz) { +__global__ void compute_row_norm_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ value_idx i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < nnz) { - atomicAdd(&out[coo_rows[i]], data[i] * data[i]); - } + if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i] * data[i]); } } template -__global__ void compute_row_sum_kernel(value_t *out, - const value_idx *__restrict__ coo_rows, - const value_t *__restrict__ data, - value_idx nnz) { +__global__ void compute_row_sum_kernel(value_t* out, + const value_idx* __restrict__ coo_rows, + const value_t* __restrict__ data, + value_idx nnz) +{ value_idx i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < nnz) { - atomicAdd(&out[coo_rows[i]], data[i]); - } + if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i]); } } template -__global__ void compute_euclidean_warp_kernel( - value_t *__restrict__ C, const value_t *__restrict__ Q_sq_norms, - const value_t *__restrict__ R_sq_norms, value_idx n_rows, value_idx n_cols, - expansion_f expansion_func) { +__global__ void compute_euclidean_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_sq_norms, + const value_t* __restrict__ R_sq_norms, + value_idx n_rows, + value_idx n_cols, + expansion_f expansion_func) +{ std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x; - value_idx i = tid / n_cols; - value_idx j = tid % n_cols; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; if (i >= n_rows || j >= n_cols) return; @@ -80,25 +81,29 @@ __global__ void compute_euclidean_warp_kernel( } template -__global__ void compute_correlation_warp_kernel( - value_t *__restrict__ C, const value_t *__restrict__ Q_sq_norms, - const value_t *__restrict__ R_sq_norms, const value_t *__restrict__ Q_norms, - const value_t *__restrict__ R_norms, value_idx n_rows, value_idx n_cols, - value_idx n) { +__global__ void compute_correlation_warp_kernel(value_t* __restrict__ C, + const value_t* __restrict__ Q_sq_norms, + const value_t* __restrict__ R_sq_norms, + const value_t* __restrict__ Q_norms, + const value_t* __restrict__ R_norms, + value_idx n_rows, + value_idx n_cols, + value_idx n) +{ std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x; - value_idx i = tid / n_cols; - value_idx j = tid % n_cols; + value_idx i = tid / n_cols; + value_idx j = tid % n_cols; if (i >= n_rows || j >= n_cols) return; - value_t dot = C[(size_t)i * n_cols + j]; + value_t dot = C[(size_t)i * n_cols + j]; value_t Q_l1 = Q_norms[i]; value_t R_l1 = R_norms[j]; value_t Q_l2 = Q_sq_norms[i]; value_t R_l2 = R_sq_norms[j]; - value_t numer = n * dot - (Q_l1 * R_l1); + value_t numer = n * dot - (Q_l1 * R_l1); value_t Q_denom = n * Q_l2 - (Q_l1 * Q_l1); value_t R_denom = n * R_l2 - (R_l1 * R_l1); @@ -108,56 +113,75 @@ __global__ void compute_correlation_warp_kernel( C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001); } -template -void compute_euclidean(value_t *C, const value_t *Q_sq_norms, - const value_t *R_sq_norms, value_idx n_rows, - value_idx n_cols, cudaStream_t stream, - expansion_f expansion_func) { +template +void compute_euclidean(value_t* C, + const value_t* Q_sq_norms, + const value_t* R_sq_norms, + value_idx n_rows, + value_idx n_cols, + cudaStream_t stream, + expansion_f expansion_func) +{ int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); compute_euclidean_warp_kernel<<>>( C, Q_sq_norms, R_sq_norms, n_rows, n_cols, expansion_func); } -template -void compute_l2(value_t *out, const value_idx *Q_coo_rows, - const value_t *Q_data, value_idx Q_nnz, - const value_idx *R_coo_rows, const value_t *R_data, - value_idx R_nnz, value_idx m, value_idx n, cudaStream_t stream, - expansion_f expansion_func) { +template +void compute_l2(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + cudaStream_t stream, + expansion_f expansion_func) +{ rmm::device_uvector Q_sq_norms(m, stream); rmm::device_uvector R_sq_norms(n, stream); - CUDA_CHECK( - cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); compute_row_norm_kernel<<>>( Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); compute_row_norm_kernel<<>>( R_sq_norms.data(), R_coo_rows, R_data, R_nnz); - compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream, - expansion_func); + compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream, expansion_func); } template -void compute_correlation(value_t *C, const value_t *Q_sq_norms, - const value_t *R_sq_norms, const value_t *Q_norms, - const value_t *R_norms, value_idx n_rows, - value_idx n_cols, value_idx n, cudaStream_t stream) { +void compute_correlation(value_t* C, + const value_t* Q_sq_norms, + const value_t* R_sq_norms, + const value_t* Q_norms, + const value_t* R_norms, + value_idx n_rows, + value_idx n_cols, + value_idx n, + cudaStream_t stream) +{ int blocks = raft::ceildiv((size_t)n_rows * n_cols, tpb); compute_correlation_warp_kernel<<>>( C, Q_sq_norms, R_sq_norms, Q_norms, R_norms, n_rows, n_cols, n); } template -void compute_corr(value_t *out, const value_idx *Q_coo_rows, - const value_t *Q_data, value_idx Q_nnz, - const value_idx *R_coo_rows, const value_t *R_data, - value_idx R_nnz, value_idx m, value_idx n, value_idx n_cols, - cudaStream_t stream) { +void compute_corr(value_t* out, + const value_idx* Q_coo_rows, + const value_t* Q_data, + value_idx Q_nnz, + const value_idx* R_coo_rows, + const value_t* R_data, + value_idx R_nnz, + value_idx m, + value_idx n, + value_idx n_cols, + cudaStream_t stream) +{ // sum_sq for std dev rmm::device_uvector Q_sq_norms(m, stream); rmm::device_uvector R_sq_norms(n, stream); @@ -166,15 +190,11 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows, rmm::device_uvector Q_norms(m, stream); rmm::device_uvector R_norms(n, stream); - CUDA_CHECK( - cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); - CUDA_CHECK( - cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t))); + CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t))); compute_row_norm_kernel<<>>( Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz); @@ -186,8 +206,15 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows, compute_row_sum_kernel<<>>( R_norms.data(), R_coo_rows, R_data, R_nnz); - compute_correlation(out, Q_sq_norms.data(), R_sq_norms.data(), Q_norms.data(), - R_norms.data(), m, n, n_cols, stream); + compute_correlation(out, + Q_sq_norms.data(), + R_sq_norms.data(), + Q_norms.data(), + R_norms.data(), + m, + n, + n_cols, + stream); } /** @@ -197,35 +224,44 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows, template class l2_expanded_distances_t : public distances_t { public: - explicit l2_expanded_distances_t( - const distances_config_t &config) - : config_(&config), ip_dists(config) {} + explicit l2_expanded_distances_t(const distances_config_t& config) + : config_(&config), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_l2( - out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, - b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - return -2 * dot + q_norm + r_norm; - }); + compute_l2(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + return -2 * dot + q_norm + r_norm; + }); } ~l2_expanded_distances_t() = default; protected: - const distances_config_t *config_; + const distances_config_t* config_; ip_distances_t ip_dists; }; @@ -234,18 +270,21 @@ class l2_expanded_distances_t : public distances_t { * The expanded form is more efficient for sparse data. */ template -class l2_sqrt_expanded_distances_t - : public l2_expanded_distances_t { +class l2_sqrt_expanded_distances_t : public l2_expanded_distances_t { public: - explicit l2_sqrt_expanded_distances_t( - const distances_config_t &config) - : l2_expanded_distances_t(config) {} + explicit l2_sqrt_expanded_distances_t(const distances_config_t& config) + : l2_expanded_distances_t(config) + { + } - void compute(value_t *out_dists) override { + void compute(value_t* out_dists) override + { l2_expanded_distances_t::compute(out_dists); // Sqrt Post-processing raft::linalg::unaryOp( - out_dists, out_dists, this->config_->a_nrows * this->config_->b_nrows, + out_dists, + out_dists, + this->config_->a_nrows * this->config_->b_nrows, [] __device__(value_t input) { int neg = input < 0 ? -1 : 1; return sqrt(abs(input) * neg); @@ -259,79 +298,96 @@ class l2_sqrt_expanded_distances_t template class correlation_expanded_distances_t : public distances_t { public: - explicit correlation_expanded_distances_t( - const distances_config_t &config) - : config_(&config), ip_dists(config) {} + explicit correlation_expanded_distances_t(const distances_config_t& config) + : config_(&config), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_corr(out_dists, search_coo_rows.data(), config_->a_data, - config_->a_nnz, b_indices, b_data, config_->b_nnz, - config_->a_nrows, config_->b_nrows, config_->b_ncols, + compute_corr(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->b_ncols, config_->handle.get_stream()); } ~correlation_expanded_distances_t() = default; protected: - const distances_config_t *config_; + const distances_config_t* config_; ip_distances_t ip_dists; }; /** - * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k)^2) * sqrt(sum(y_k)^2))) - * The expanded form is more efficient for sparse data. + * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k)^2) * + * sqrt(sum(y_k)^2))) The expanded form is more efficient for sparse data. */ template class cosine_expanded_distances_t : public distances_t { public: - explicit cosine_expanded_distances_t( - const distances_config_t &config) - : config_(&config), - workspace(0, config.handle.get_stream()), - ip_dists(config) {} + explicit cosine_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_idx *b_indices = ip_dists.b_rows_coo(); - value_t *b_data = ip_dists.b_data_coo(); + value_idx* b_indices = ip_dists.b_rows_coo(); + value_t* b_data = ip_dists.b_data_coo(); - rmm::device_uvector search_coo_rows( - config_->a_nnz, config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - search_coo_rows.data(), config_->a_nnz, + rmm::device_uvector search_coo_rows(config_->a_nnz, config_->handle.get_stream()); + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + search_coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); - compute_l2( - out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, - b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_stream(), - [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { - value_t norms = sqrt(q_norm) * sqrt(r_norm); - // deal with potential for 0 in denominator by forcing 0/1 instead - value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms); - - // flip the similarity when both rows are 0 - bool both_empty = (q_norm == 0) && (r_norm == 0); - return 1 - ((!both_empty * cos) + both_empty); - }); + compute_l2(out_dists, + search_coo_rows.data(), + config_->a_data, + config_->a_nnz, + b_indices, + b_data, + config_->b_nnz, + config_->a_nrows, + config_->b_nrows, + config_->handle.get_stream(), + [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { + value_t norms = sqrt(q_norm) * sqrt(r_norm); + // deal with potential for 0 in denominator by forcing 0/1 instead + value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms); + + // flip the similarity when both rows are 0 + bool both_empty = (q_norm == 0) && (r_norm == 0); + return 1 - ((!both_empty * cos) + both_empty); + }); } ~cosine_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; @@ -348,25 +404,34 @@ class cosine_expanded_distances_t : public distances_t { template class hellinger_expanded_distances_t : public distances_t { public: - explicit hellinger_expanded_distances_t( - const distances_config_t &config) - : config_(&config), workspace(0, config.handle.get_stream()) {} + explicit hellinger_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { rmm::device_uvector coo_rows(max(config_->b_nnz, config_->a_nnz), config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, - coo_rows.data(), config_->b_nnz, + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv( - out_dists, *config_, coo_rows.data(), - [] __device__(value_t a, value_t b) { return sqrt(a) * sqrt(b); }, Sum(), + out_dists, + *config_, + coo_rows.data(), + [] __device__(value_t a, value_t b) { return sqrt(a) * sqrt(b); }, + Sum(), AtomicAdd()); raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative bool rectifier = (1 - input) > 0; @@ -378,42 +443,43 @@ class hellinger_expanded_distances_t : public distances_t { ~hellinger_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; }; template class russelrao_expanded_distances_t : public distances_t { public: - explicit russelrao_expanded_distances_t( - const distances_config_t &config) - : config_(&config), - workspace(0, config.handle.get_stream()), - ip_dists(config) {} + explicit russelrao_expanded_distances_t(const distances_config_t& config) + : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { ip_dists.compute(out_dists); - value_t n_cols = config_->a_ncols; + value_t n_cols = config_->a_ncols; value_t n_cols_inv = 1.0 / n_cols; raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return (n_cols - input) * n_cols_inv; }, config_->handle.get_stream()); - auto exec_policy = rmm::exec_policy(config_->handle.get_stream()); - auto diags = thrust::counting_iterator(0); + auto exec_policy = rmm::exec_policy(config_->handle.get_stream()); + auto diags = thrust::counting_iterator(0); value_idx b_nrows = config_->b_nrows; - thrust::for_each(exec_policy, diags, diags + config_->a_nrows, - [=] __device__(value_idx input) { - out_dists[input * b_nrows + input] = 0.0; - }); + thrust::for_each(exec_policy, diags, diags + config_->a_nrows, [=] __device__(value_idx input) { + out_dists[input * b_nrows + input] = 0.0; + }); } ~russelrao_expanded_distances_t() = default; private: - const distances_config_t *config_; + const distances_config_t* config_; rmm::device_uvector workspace; ip_distances_t ip_dists; }; diff --git a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh index c11369375b..f5e7c75988 100644 --- a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh @@ -39,23 +39,33 @@ namespace sparse { namespace distance { namespace detail { -template -void unexpanded_lp_distances( - value_t *out_dists, const distances_config_t *config_, - product_f product_func, accum_f accum_func, write_f write_func) { +template +void unexpanded_lp_distances(value_t* out_dists, + const distances_config_t* config_, + product_f product_func, + accum_f accum_func, + write_f write_func) +{ rmm::device_uvector coo_rows(max(config_->b_nnz, config_->a_nnz), config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, - coo_rows.data(), config_->b_nnz, + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv( out_dists, *config_, coo_rows.data(), product_func, accum_func, write_func); - raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows, - coo_rows.data(), config_->a_nnz, + raft::sparse::convert::csr_to_coo(config_->a_indptr, + config_->a_nrows, + coo_rows.data(), + config_->a_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv_rev( @@ -72,48 +82,51 @@ void unexpanded_lp_distances( template class l1_unexpanded_distances_t : public distances_t { public: - l1_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + l1_unexpanded_distances_t(const distances_config_t& config) : config_(&config) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, AbsDiff(), - Sum(), AtomicAdd()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, AbsDiff(), Sum(), AtomicAdd()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class l2_unexpanded_distances_t : public distances_t { public: - l2_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + l2_unexpanded_distances_t(const distances_config_t& config) : config_(&config) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, SqDiff(), - Sum(), AtomicAdd()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, SqDiff(), Sum(), AtomicAdd()); } protected: - const distances_config_t *config_; + const distances_config_t* config_; }; template -class l2_sqrt_unexpanded_distances_t - : public l2_unexpanded_distances_t { +class l2_sqrt_unexpanded_distances_t : public l2_unexpanded_distances_t { public: - l2_sqrt_unexpanded_distances_t( - const distances_config_t &config) - : l2_unexpanded_distances_t(config) {} + l2_sqrt_unexpanded_distances_t(const distances_config_t& config) + : l2_unexpanded_distances_t(config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { l2_unexpanded_distances_t::compute(out_dists); // Sqrt Post-processing raft::linalg::unaryOp( - out_dists, out_dists, this->config_->a_nrows * this->config_->b_nrows, + out_dists, + out_dists, + this->config_->a_nrows * this->config_->b_nrows, [] __device__(value_t input) { int neg = input < 0 ? -1 : 1; return sqrt(abs(input) * neg); @@ -125,29 +138,33 @@ class l2_sqrt_unexpanded_distances_t template class linf_unexpanded_distances_t : public distances_t { public: - explicit linf_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + explicit linf_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, AbsDiff(), - Max(), AtomicMax()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, AbsDiff(), Max(), AtomicMax()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class canberra_unexpanded_distances_t : public distances_t { public: - explicit canberra_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + explicit canberra_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { unexpanded_lp_distances( - out_dists, config_, + out_dists, + config_, [] __device__(value_t a, value_t b) { value_t d = fabs(a) + fabs(b); @@ -155,70 +172,82 @@ class canberra_unexpanded_distances_t : public distances_t { // forcing 1/0 instead return ((d != 0) * fabs(a - b)) / (d + (d == 0)); }, - Sum(), AtomicAdd()); + Sum(), + AtomicAdd()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class lp_unexpanded_distances_t : public distances_t { public: - explicit lp_unexpanded_distances_t( - const distances_config_t &config, value_t p_) - : config_(&config), p(p_) {} + explicit lp_unexpanded_distances_t(const distances_config_t& config, + value_t p_) + : config_(&config), p(p_) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, PDiff(p), - Sum(), AtomicAdd()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, PDiff(p), Sum(), AtomicAdd()); float one_over_p = 1.0f / p; raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return pow(input, one_over_p); }, config_->handle.get_stream()); } private: - const distances_config_t *config_; + const distances_config_t* config_; value_t p; }; template class hamming_unexpanded_distances_t : public distances_t { public: - explicit hamming_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + explicit hamming_unexpanded_distances_t(const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { - unexpanded_lp_distances(out_dists, config_, NotEqual(), - Sum(), AtomicAdd()); + void compute(value_t* out_dists) + { + unexpanded_lp_distances(out_dists, config_, NotEqual(), Sum(), AtomicAdd()); value_t n_cols = 1.0 / config_->a_ncols; raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return input * n_cols; }, config_->handle.get_stream()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class jensen_shannon_unexpanded_distances_t : public distances_t { public: explicit jensen_shannon_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { unexpanded_lp_distances( - out_dists, config_, + out_dists, + config_, [] __device__(value_t a, value_t b) { - value_t m = 0.5f * (a + b); + value_t m = 0.5f * (a + b); bool a_zero = a == 0; bool b_zero = b == 0; @@ -228,49 +257,61 @@ class jensen_shannon_unexpanded_distances_t : public distances_t { bool x_zero = x == 0; bool y_zero = y == 0; - return (-a * (!x_zero * log(x + x_zero))) + - (-b * (!y_zero * log(y + y_zero))); + return (-a * (!x_zero * log(x + x_zero))) + (-b * (!y_zero * log(y + y_zero))); }, - Sum(), AtomicAdd()); + Sum(), + AtomicAdd()); raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return sqrt(0.5 * input); }, config_->handle.get_stream()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; template class kl_divergence_unexpanded_distances_t : public distances_t { public: explicit kl_divergence_unexpanded_distances_t( - const distances_config_t &config) - : config_(&config) {} + const distances_config_t& config) + : config_(&config) + { + } - void compute(value_t *out_dists) { + void compute(value_t* out_dists) + { rmm::device_uvector coo_rows(max(config_->b_nnz, config_->a_nnz), config_->handle.get_stream()); - raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows, - coo_rows.data(), config_->b_nnz, + raft::sparse::convert::csr_to_coo(config_->b_indptr, + config_->b_nrows, + coo_rows.data(), + config_->b_nnz, config_->handle.get_stream()); balanced_coo_pairwise_generalized_spmv( - out_dists, *config_, coo_rows.data(), - [] __device__(value_t a, value_t b) { return a * log(a / b); }, Sum(), + out_dists, + *config_, + coo_rows.data(), + [] __device__(value_t a, value_t b) { return a * log(a / b); }, + Sum(), AtomicAdd()); raft::linalg::unaryOp( - out_dists, out_dists, config_->a_nrows * config_->b_nrows, + out_dists, + out_dists, + config_->a_nrows * config_->b_nrows, [=] __device__(value_t input) { return 0.5 * input; }, config_->handle.get_stream()); } private: - const distances_config_t *config_; + const distances_config_t* config_; }; }; // END namespace detail diff --git a/cpp/include/raft/sparse/distance/detail/operators.cuh b/cpp/include/raft/sparse/distance/detail/operators.cuh index 9f206095bf..b2c2e2172b 100644 --- a/cpp/include/raft/sparse/distance/detail/operators.cuh +++ b/cpp/include/raft/sparse/distance/detail/operators.cuh @@ -25,21 +25,24 @@ namespace detail { struct Sum { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return a + b; } }; struct NotEqual { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return a != b; } }; struct SqDiff { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return (a - b) * (a - b); } }; @@ -50,44 +53,48 @@ struct PDiff { PDiff(float p_) : p(p_) {} template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return pow(a - b, p); } }; struct Max { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return fmax(a, b); } }; struct AtomicAdd { template - __host__ __device__ __forceinline__ value_t operator()(value_t *a, - value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t* a, value_t b) + { return atomicAdd(a, b); } }; struct AtomicMax { template - __host__ __device__ __forceinline__ value_t operator()(value_t *a, - value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t* a, value_t b) + { return atomicMax(a, b); } }; struct Product { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return a * b; } }; struct AbsDiff { template - __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) { + __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) + { return fabs(a - b); } }; diff --git a/cpp/include/raft/sparse/distance/detail/utils.cuh b/cpp/include/raft/sparse/distance/detail/utils.cuh index abfb7d24ea..8c01b33c1e 100644 --- a/cpp/include/raft/sparse/distance/detail/utils.cuh +++ b/cpp/include/raft/sparse/distance/detail/utils.cuh @@ -33,10 +33,10 @@ namespace detail { * @return the maximum number of columns that can be stored in smem */ template -inline int max_cols_per_block() { +inline int max_cols_per_block() +{ // max cols = (total smem available - cub reduction smem) - return (raft::getSharedMemPerBlock() - - ((tpb / raft::warp_size()) * sizeof(value_t))) / + return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) / sizeof(value_t); } diff --git a/cpp/include/raft/sparse/distance/distance.hpp b/cpp/include/raft/sparse/distance/distance.hpp index 0aeabe5019..92c08654d2 100644 --- a/cpp/include/raft/sparse/distance/distance.hpp +++ b/cpp/include/raft/sparse/distance/distance.hpp @@ -71,90 +71,71 @@ static const std::unordered_set supportedDistance{ * @param[out] out dense output array (size A.nrows * B.nrows) * @param[in] input_config input argument configuration * @param[in] metric distance metric to use -* @param[in] metric_arg metric argument (used for Minkowski distance) + * @param[in] metric_arg metric argument (used for Minkowski distance) */ template -void pairwiseDistance(value_t *out, +void pairwiseDistance(value_t* out, distances_config_t input_config, - raft::distance::DistanceType metric, float metric_arg) { + raft::distance::DistanceType metric, + float metric_arg) +{ switch (metric) { case raft::distance::DistanceType::L2Expanded: - detail::l2_expanded_distances_t(input_config) - .compute(out); + detail::l2_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::L2SqrtExpanded: - detail::l2_sqrt_expanded_distances_t(input_config) - .compute(out); + detail::l2_sqrt_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::InnerProduct: detail::ip_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::L2Unexpanded: - detail::l2_unexpanded_distances_t(input_config) - .compute(out); + detail::l2_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::L2SqrtUnexpanded: - detail::l2_sqrt_unexpanded_distances_t(input_config) - .compute(out); + detail::l2_sqrt_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::L1: - detail::l1_unexpanded_distances_t(input_config) - .compute(out); + detail::l1_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::LpUnexpanded: - detail::lp_unexpanded_distances_t(input_config, - metric_arg) - .compute(out); + detail::lp_unexpanded_distances_t(input_config, metric_arg).compute(out); break; case raft::distance::DistanceType::Linf: - detail::linf_unexpanded_distances_t(input_config) - .compute(out); + detail::linf_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::Canberra: - detail::canberra_unexpanded_distances_t(input_config) - .compute(out); + detail::canberra_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::JaccardExpanded: - detail::jaccard_expanded_distances_t(input_config) - .compute(out); + detail::jaccard_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::CosineExpanded: - detail::cosine_expanded_distances_t(input_config) - .compute(out); + detail::cosine_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::HellingerExpanded: - detail::hellinger_expanded_distances_t(input_config) - .compute(out); + detail::hellinger_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::DiceExpanded: - detail::dice_expanded_distances_t(input_config) - .compute(out); + detail::dice_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::CorrelationExpanded: - detail::correlation_expanded_distances_t(input_config) - .compute(out); + detail::correlation_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::RusselRaoExpanded: - detail::russelrao_expanded_distances_t(input_config) - .compute(out); + detail::russelrao_expanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::HammingUnexpanded: - detail::hamming_unexpanded_distances_t(input_config) - .compute(out); + detail::hamming_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::JensenShannon: - detail::jensen_shannon_unexpanded_distances_t( - input_config) - .compute(out); + detail::jensen_shannon_unexpanded_distances_t(input_config).compute(out); break; case raft::distance::DistanceType::KLDivergence: - detail::kl_divergence_unexpanded_distances_t( - input_config) - .compute(out); + detail::kl_divergence_unexpanded_distances_t(input_config).compute(out); break; - default: - THROW("Unsupported distance: %d", metric); + default: THROW("Unsupported distance: %d", metric); } } diff --git a/cpp/include/raft/sparse/hierarchy/common.h b/cpp/include/raft/sparse/hierarchy/common.h index 29f541498b..1738dd7498 100644 --- a/cpp/include/raft/sparse/hierarchy/common.h +++ b/cpp/include/raft/sparse/hierarchy/common.h @@ -37,13 +37,15 @@ class linkage_output { value_idx n_leaves; value_idx n_connected_components; - value_idx *labels; // size: m + value_idx* labels; // size: m - value_idx *children; // size: (m-1, 2) + value_idx* children; // size: (m-1, 2) }; -class linkage_output_int_float : public linkage_output {}; -class linkage_output__int64_float : public linkage_output {}; +class linkage_output_int_float : public linkage_output { +}; +class linkage_output__int64_float : public linkage_output { +}; }; // namespace hierarchy }; // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 4ef2ac43e2..207cca7287 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -44,31 +44,32 @@ class UnionFind { value_idx n_indices; UnionFind(value_idx N_) - : n_indices(2 * N_ - 1), - parent(2 * N_ - 1, -1), - size(2 * N_ - 1, 1), - next_label(N_) { + : n_indices(2 * N_ - 1), parent(2 * N_ - 1, -1), size(2 * N_ - 1, 1), next_label(N_) + { memset(size.data() + N_, 0, (size.size() - N_) * sizeof(value_idx)); } - value_idx find(value_idx n) { + value_idx find(value_idx n) + { value_idx p; p = n; - while (parent[n] != -1) n = parent[n]; + while (parent[n] != -1) + n = parent[n]; // path compression while (parent[p] != n) { - p = parent[p == -1 ? n_indices - 1 : p]; + p = parent[p == -1 ? n_indices - 1 : p]; parent[p == -1 ? n_indices - 1 : p] = n; } return n; } - void perform_union(value_idx m, value_idx n) { + void perform_union(value_idx m, value_idx n) + { size[next_label] = size[m] + size[n]; - parent[m] = next_label; - parent[n] = next_label; + parent[m] = next_label; + parent[n] = next_label; next_label += 1; } @@ -97,10 +98,15 @@ class UnionFind { * @param[out] out_size cluster sizes of output */ template -void build_dendrogram_host(const handle_t &handle, const value_idx *rows, - const value_idx *cols, const value_t *data, - size_t nnz, value_idx *children, value_t *out_delta, - value_idx *out_size) { +void build_dendrogram_host(const handle_t& handle, + const value_idx* rows, + const value_idx* cols, + const value_t* data, + size_t nnz, + value_idx* children, + value_t* out_delta, + value_idx* out_size) +{ auto stream = handle.get_stream(); value_idx n_edges = nnz; @@ -122,8 +128,8 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows, UnionFind U(nnz + 1); for (std::size_t i = 0; i < nnz; i++) { - value_idx a = mst_src_h[i]; - value_idx b = mst_dst_h[i]; + value_idx a = mst_src_h[i]; + value_idx b = mst_dst_h[i]; value_t delta = mst_weights_h[i]; value_idx aa = U.find(a); @@ -131,10 +137,10 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows, value_idx children_idx = i * 2; - children_h[children_idx] = aa; + children_h[children_idx] = aa; children_h[children_idx + 1] = bb; - out_delta_h[i] = delta; - out_size_h[i] = U.size[aa] + U.size[bb]; + out_delta_h[i] = delta; + out_size_h[i] = U.size[aa] + U.size[bb]; U.perform_union(aa, bb); } @@ -145,13 +151,15 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows, } template -__global__ void write_levels_kernel(const value_idx *children, - value_idx *parents, value_idx n_vertices) { +__global__ void write_levels_kernel(const value_idx* children, + value_idx* parents, + value_idx n_vertices) +{ value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < n_vertices) { value_idx level = tid / 2; value_idx child = children[tid]; - parents[child] = level; + parents[child] = level; } } @@ -167,14 +175,17 @@ __global__ void write_levels_kernel(const value_idx *children, * @param labels */ template -__global__ void inherit_labels(const value_idx *children, - const value_idx *levels, std::size_t n_leaves, - value_idx *labels, int cut_level, - value_idx n_vertices) { +__global__ void inherit_labels(const value_idx* children, + const value_idx* levels, + std::size_t n_leaves, + value_idx* labels, + int cut_level, + value_idx n_vertices) +{ value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < n_vertices) { - value_idx node = children[tid]; + value_idx node = children[tid]; value_idx cur_level = tid / 2; /** @@ -184,12 +195,12 @@ __global__ void inherit_labels(const value_idx *children, if (cur_level > cut_level) return; value_idx cur_parent = node; - value_idx label = labels[cur_parent]; + value_idx label = labels[cur_parent]; while (label == -1) { cur_parent = cur_level + n_leaves; - cur_level = levels[cur_parent]; - label = labels[cur_parent]; + cur_level = levels[cur_parent]; + label = labels[cur_parent]; } labels[node] = label; @@ -198,15 +209,16 @@ __global__ void inherit_labels(const value_idx *children, template struct init_label_roots { - init_label_roots(value_idx *labels_) : labels(labels_) {} + init_label_roots(value_idx* labels_) : labels(labels_) {} template - __host__ __device__ void operator()(Tuple t) { + __host__ __device__ void operator()(Tuple t) + { labels[thrust::get<1>(t)] = thrust::get<0>(t); } private: - value_idx *labels; + value_idx* labels; }; /** @@ -222,10 +234,13 @@ struct init_label_roots { * @param n_leaves */ template -void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, - const value_idx *children, size_t n_clusters, - size_t n_leaves) { - auto stream = handle.get_stream(); +void extract_flattened_clusters(const raft::handle_t& handle, + value_idx* labels, + const value_idx* children, + size_t n_clusters, + size_t n_leaves) +{ + auto stream = handle.get_stream(); auto thrust_policy = handle.get_thrust_policy(); // Handle special case where n_clusters == 1 @@ -243,24 +258,21 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, auto n_edges = (n_leaves - 1) * 2; - thrust::device_ptr d_ptr = - thrust::device_pointer_cast(children); - value_idx n_vertices = - *(thrust::max_element(thrust_policy, d_ptr, d_ptr + n_edges)) + 1; + thrust::device_ptr d_ptr = thrust::device_pointer_cast(children); + value_idx n_vertices = *(thrust::max_element(thrust_policy, d_ptr, d_ptr + n_edges)) + 1; // Prevent potential infinite loop from labeling disconnected // connectivities graph. RAFT_EXPECTS(n_leaves > 0, "n_leaves must be positive"); - RAFT_EXPECTS(static_cast(n_vertices) == - static_cast((n_leaves - 1) * 2), - "Multiple components found in MST or MST is invalid. " - "Cannot find single-linkage solution."); + RAFT_EXPECTS( + static_cast(n_vertices) == static_cast((n_leaves - 1) * 2), + "Multiple components found in MST or MST is invalid. " + "Cannot find single-linkage solution."); rmm::device_uvector levels(n_vertices, stream); value_idx n_blocks = ceildiv(n_vertices, (value_idx)tpb); - write_levels_kernel<<>>(children, levels.data(), - n_vertices); + write_levels_kernel<<>>(children, levels.data(), n_vertices); /** * Step 1: Find label roots: * @@ -274,27 +286,26 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, rmm::device_uvector label_roots(child_size, stream); value_idx children_cpy_start = n_edges - child_size; - raft::copy_async(label_roots.data(), children + children_cpy_start, - child_size, stream); + raft::copy_async(label_roots.data(), children + children_cpy_start, child_size, stream); - thrust::sort(thrust_policy, label_roots.data(), + thrust::sort(thrust_policy, + label_roots.data(), label_roots.data() + (child_size), thrust::greater()); rmm::device_uvector tmp_labels(n_vertices, stream); // Init labels to -1 - thrust::fill(thrust_policy, tmp_labels.data(), - tmp_labels.data() + n_vertices, -1); + thrust::fill(thrust_policy, tmp_labels.data(), tmp_labels.data() + n_vertices, -1); // Write labels for cluster roots to "labels" thrust::counting_iterator first(0); - auto z_iter = thrust::make_zip_iterator(thrust::make_tuple( - first, label_roots.data() + (label_roots.size() - n_clusters))); + auto z_iter = thrust::make_zip_iterator( + thrust::make_tuple(first, label_roots.data() + (label_roots.size() - n_clusters))); - thrust::for_each(thrust_policy, z_iter, z_iter + n_clusters, - init_label_roots(tmp_labels.data())); + thrust::for_each( + thrust_policy, z_iter, z_iter + n_clusters, init_label_roots(tmp_labels.data())); /** * Step 2: Propagate labels by having children iterate through their parents @@ -304,9 +315,8 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, */ value_idx cut_level = (n_edges / 2) - (n_clusters - 1); - inherit_labels<<>>(children, levels.data(), - n_leaves, tmp_labels.data(), - cut_level, n_vertices); + inherit_labels<<>>( + children, levels.data(), n_leaves, tmp_labels.data(), cut_level, n_vertices); // copy tmp labels to actual labels raft::copy_async(labels, tmp_labels.data(), n_leaves, stream); diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh index 31e4a0f263..c06c24e100 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh @@ -36,14 +36,17 @@ namespace raft { namespace hierarchy { namespace detail { -template +template struct distance_graph_impl { - void run(const raft::handle_t &handle, const value_t *X, size_t m, size_t n, + void run(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, raft::distance::DistanceType metric, - rmm::device_uvector &indptr, - rmm::device_uvector &indices, - rmm::device_uvector &data, int c); + rmm::device_uvector& indptr, + rmm::device_uvector& indices, + rmm::device_uvector& data, + int c); }; /** @@ -52,37 +55,41 @@ struct distance_graph_impl { * @tparam value_t */ template -struct distance_graph_impl { - void run(const raft::handle_t &handle, const value_t *X, size_t m, size_t n, +struct distance_graph_impl { + void run(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, raft::distance::DistanceType metric, - rmm::device_uvector &indptr, - rmm::device_uvector &indices, - rmm::device_uvector &data, int c) { - auto stream = handle.get_stream(); + rmm::device_uvector& indptr, + rmm::device_uvector& indices, + rmm::device_uvector& data, + int c) + { + auto stream = handle.get_stream(); auto thrust_policy = handle.get_thrust_policy(); // Need to symmetrize knn into undirected graph raft::sparse::COO knn_graph_coo(stream); - raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, - c); + raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, c); indices.resize(knn_graph_coo.nnz, stream); data.resize(knn_graph_coo.nnz, stream); // self-loops get max distance - auto transform_in = thrust::make_zip_iterator(thrust::make_tuple( - knn_graph_coo.rows(), knn_graph_coo.cols(), knn_graph_coo.vals())); - - thrust::transform( - thrust_policy, transform_in, transform_in + knn_graph_coo.nnz, - knn_graph_coo.vals(), - [=] __device__(const thrust::tuple &tup) { - bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup); - return (self_loop * std::numeric_limits::max()) + - (!self_loop * thrust::get<2>(tup)); - }); + auto transform_in = thrust::make_zip_iterator( + thrust::make_tuple(knn_graph_coo.rows(), knn_graph_coo.cols(), knn_graph_coo.vals())); + + thrust::transform(thrust_policy, + transform_in, + transform_in + knn_graph_coo.nnz, + knn_graph_coo.vals(), + [=] __device__(const thrust::tuple& tup) { + bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup); + return (self_loop * std::numeric_limits::max()) + + (!self_loop * thrust::get<2>(tup)); + }); raft::sparse::convert::sorted_coo_to_csr( knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, stream); @@ -90,10 +97,8 @@ struct distance_graph_impl -void get_distance_graph(const raft::handle_t &handle, const value_t *X, - size_t m, size_t n, raft::distance::DistanceType metric, - rmm::device_uvector &indptr, - rmm::device_uvector &indices, - rmm::device_uvector &data, int c) { +template +void get_distance_graph(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, + raft::distance::DistanceType metric, + rmm::device_uvector& indptr, + rmm::device_uvector& indices, + rmm::device_uvector& data, + int c) +{ auto stream = handle.get_stream(); indptr.resize(m + 1, stream); diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh index 6ef6f9879b..0c0b049f11 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh @@ -34,9 +34,10 @@ namespace hierarchy { namespace detail { template -void merge_msts(raft::Graph_COO &coo1, - raft::Graph_COO &coo2, - cudaStream_t stream) { +void merge_msts(raft::Graph_COO& coo1, + raft::Graph_COO& coo2, + cudaStream_t stream) +{ /** Add edges to existing mst **/ int final_nnz = coo2.n_edges + coo1.n_edges; @@ -47,12 +48,9 @@ void merge_msts(raft::Graph_COO &coo1, /** * Construct final edge list */ - raft::copy_async(coo1.src.data() + coo1.n_edges, coo2.src.data(), - coo2.n_edges, stream); - raft::copy_async(coo1.dst.data() + coo1.n_edges, coo2.dst.data(), - coo2.n_edges, stream); - raft::copy_async(coo1.weights.data() + coo1.n_edges, coo2.weights.data(), - coo2.n_edges, stream); + raft::copy_async(coo1.src.data() + coo1.n_edges, coo2.src.data(), coo2.n_edges, stream); + raft::copy_async(coo1.dst.data() + coo1.n_edges, coo2.dst.data(), coo2.n_edges, stream); + raft::copy_async(coo1.weights.data() + coo1.n_edges, coo2.weights.data(), coo2.n_edges, stream); coo1.n_edges = final_nnz; } @@ -71,12 +69,16 @@ void merge_msts(raft::Graph_COO &coo1, * @return updated MST edge list */ template -void connect_knn_graph(const raft::handle_t &handle, const value_t *X, - raft::Graph_COO &msf, - size_t m, size_t n, value_idx *color, - red_op reduction_op, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2SqrtExpanded) { +void connect_knn_graph( + const raft::handle_t& handle, + const value_t* X, + raft::Graph_COO& msf, + size_t m, + size_t n, + value_idx* color, + red_op reduction_op, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) +{ auto stream = handle.get_stream(); raft::sparse::COO connected_edges(stream); @@ -90,9 +92,16 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X, // On the second call, we hand the MST the original colors // and the new set of edges and let it restart the optimization process - auto new_mst = raft::mst::mst( - handle, indptr2.data(), connected_edges.cols(), connected_edges.vals(), m, - connected_edges.nnz, color, stream, false, false); + auto new_mst = raft::mst::mst(handle, + indptr2.data(), + connected_edges.cols(), + connected_edges.vals(), + m, + connected_edges.nnz, + color, + stream, + false, + false); merge_msts(msf, new_mst, stream); } @@ -122,28 +131,34 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X, * argument is really just a safeguard against the potential for infinite loops. */ template -void build_sorted_mst(const raft::handle_t &handle, const value_t *X, - const value_idx *indptr, const value_idx *indices, - const value_t *pw_dists, size_t m, size_t n, - value_idx *mst_src, value_idx *mst_dst, - value_t *mst_weight, value_idx *color, size_t nnz, - red_op reduction_op, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2SqrtExpanded, - int max_iter = 10) { +void build_sorted_mst( + const raft::handle_t& handle, + const value_t* X, + const value_idx* indptr, + const value_idx* indices, + const value_t* pw_dists, + size_t m, + size_t n, + value_idx* mst_src, + value_idx* mst_dst, + value_t* mst_weight, + value_idx* color, + size_t nnz, + red_op reduction_op, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded, + int max_iter = 10) +{ auto stream = handle.get_stream(); // We want to have MST initialize colors on first call. auto mst_coo = raft::mst::mst( - handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false, - true); + handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false, true); - int iters = 1; + int iters = 1; int n_components = linkage::get_n_components(color, m, stream); while (n_components > 1 && iters < max_iter) { - connect_knn_graph(handle, X, mst_coo, m, n, color, - reduction_op); + connect_knn_graph(handle, X, mst_coo, m, n, color, reduction_op); iters++; @@ -170,9 +185,8 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, " or increase 'max_iter'", max_iter); - raft::sparse::op::coo_sort_by_weight(mst_coo.src.data(), mst_coo.dst.data(), - mst_coo.weights.data(), mst_coo.n_edges, - stream); + raft::sparse::op::coo_sort_by_weight( + mst_coo.src.data(), mst_coo.dst.data(), mst_coo.weights.data(), mst_coo.n_edges, stream); raft::copy_async(mst_src, mst_coo.src.data(), mst_coo.n_edges, stream); raft::copy_async(mst_dst, mst_coo.dst.data(), mst_coo.n_edges, stream); diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp index 06fffb8aed..3b6f1347ab 100644 --- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp +++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp @@ -44,18 +44,24 @@ static const size_t EMPTY = 0; * @param[in] n number of columns in X * @param[in] metric distance metrix to use when constructing connectivities graph * @param[out] out struct containing output dendrogram and cluster assignments - * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect control + * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect + control * of k. The algorithm will set `k = log(n) + c` * @param[in] n_clusters number of clusters to assign data samples */ -template -void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m, - size_t n, raft::distance::DistanceType metric, - linkage_output *out, int c, - size_t n_clusters) { - ASSERT(n_clusters <= m, - "n_clusters must be less than or equal to the number of data points"); +void single_linkage(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, + raft::distance::DistanceType metric, + linkage_output* out, + int c, + size_t n_clusters) +{ + ASSERT(n_clusters <= m, "n_clusters must be less than or equal to the number of data points"); auto stream = handle.get_stream(); @@ -78,10 +84,20 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m, */ rmm::device_uvector color(m, stream); raft::linkage::FixConnectivitiesRedOp op(color.data(), m); - detail::build_sorted_mst( - handle, X, indptr.data(), indices.data(), pw_dists.data(), m, n, - mst_rows.data(), mst_cols.data(), mst_data.data(), color.data(), - indices.size(), op, metric); + detail::build_sorted_mst(handle, + X, + indptr.data(), + indices.data(), + pw_dists.data(), + m, + n, + mst_rows.data(), + mst_cols.data(), + mst_data.data(), + color.data(), + indices.size(), + op, + metric); pw_dists.release(); @@ -93,15 +109,19 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m, rmm::device_uvector out_delta(n_edges, stream); rmm::device_uvector out_size(n_edges, stream); // Create dendrogram - detail::build_dendrogram_host( - handle, mst_rows.data(), mst_cols.data(), mst_data.data(), n_edges, - out->children, out_delta.data(), out_size.data()); - detail::extract_flattened_clusters(handle, out->labels, out->children, - n_clusters, m); - - out->m = m; - out->n_clusters = n_clusters; - out->n_leaves = m; + detail::build_dendrogram_host(handle, + mst_rows.data(), + mst_cols.data(), + mst_data.data(), + n_edges, + out->children, + out_delta.data(), + out_size.data()); + detail::extract_flattened_clusters(handle, out->labels, out->children, n_clusters, m); + + out->m = m; + out->n_clusters = n_clusters; + out->n_leaves = m; out->n_connected_components = 1; } diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh index 7ed627b9e2..0c17d55762 100644 --- a/cpp/include/raft/sparse/linalg/add.cuh +++ b/cpp/include/raft/sparse/linalg/add.cuh @@ -40,40 +40,47 @@ namespace sparse { namespace linalg { template -__global__ void csr_add_calc_row_counts_kernel( - const int *a_ind, const int *a_indptr, const T *a_val, int nnz1, - const int *b_ind, const int *b_indptr, const T *b_val, int nnz2, int m, - int *out_rowcounts) { +__global__ void csr_add_calc_row_counts_kernel(const int* a_ind, + const int* a_indptr, + const T* a_val, + int nnz1, + const int* b_ind, + const int* b_indptr, + const T* b_val, + int nnz2, + int m, + int* out_rowcounts) +{ // loop through columns in each set of rows and // calculate number of unique cols across both rows int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < m) { int a_start_idx = a_ind[row]; - int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind); + int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind); int b_start_idx = b_ind[row]; - int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); + int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); /** - * Union of columns within each row of A and B so that we can scan through - * them, adding their values together. - */ + * Union of columns within each row of A and B so that we can scan through + * them, adding their values together. + */ int max_size = (a_stop_idx - a_start_idx) + (b_stop_idx - b_start_idx); - int *arr = new int[max_size]; + int* arr = new int[max_size]; int cur_arr_idx = 0; for (int j = a_start_idx; j < a_stop_idx; j++) { arr[cur_arr_idx] = a_indptr[j]; cur_arr_idx++; } - int arr_size = cur_arr_idx; + int arr_size = cur_arr_idx; int final_size = arr_size; for (int j = b_start_idx; j < b_stop_idx; j++) { int cur_col = b_indptr[j]; - bool found = false; + bool found = false; for (int k = 0; k < arr_size; k++) { if (arr[k] == cur_col) { found = true; @@ -81,9 +88,7 @@ __global__ void csr_add_calc_row_counts_kernel( } } - if (!found) { - final_size++; - } + if (!found) { final_size++; } } out_rowcounts[row] = final_size; @@ -94,11 +99,19 @@ __global__ void csr_add_calc_row_counts_kernel( } template -__global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, - const T *a_val, int nnz1, const int *b_ind, - const int *b_indptr, const T *b_val, int nnz2, - int m, int *out_ind, int *out_indptr, - T *out_val) { +__global__ void csr_add_kernel(const int* a_ind, + const int* a_indptr, + const T* a_val, + int nnz1, + const int* b_ind, + const int* b_indptr, + const T* b_val, + int nnz2, + int m, + int* out_ind, + int* out_indptr, + T* out_val) +{ // 1 thread per row int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -109,21 +122,21 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind); int b_start_idx = b_ind[row]; - int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); + int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); int o_idx = out_ind[row]; int cur_o_idx = o_idx; for (int j = a_start_idx; j < a_stop_idx; j++) { out_indptr[cur_o_idx] = a_indptr[j]; - out_val[cur_o_idx] = a_val[j]; + out_val[cur_o_idx] = a_val[j]; cur_o_idx++; } int arr_size = cur_o_idx - o_idx; for (int j = b_start_idx; j < b_stop_idx; j++) { int cur_col = b_indptr[j]; - bool found = false; + bool found = false; for (int k = o_idx; k < o_idx + arr_size; k++) { // If we found a match, sum the two values if (out_indptr[k] == cur_col) { @@ -136,7 +149,7 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, // if we didn't find a match, add the value for b if (!found) { out_indptr[o_idx + arr_size] = cur_col; - out_val[o_idx + arr_size] = b_val[j]; + out_val[o_idx + arr_size] = b_val[j]; arr_size++; } } @@ -159,31 +172,35 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, * @param stream: cuda stream to use */ template -size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, - int nnz1, const int *b_ind, const int *b_indptr, - const T *b_val, int nnz2, int m, int *out_ind, - cudaStream_t stream) { +size_t csr_add_calc_inds(const int* a_ind, + const int* a_indptr, + const T* a_val, + int nnz1, + const int* b_ind, + const int* b_indptr, + const T* b_val, + int nnz2, + int m, + int* out_ind, + cudaStream_t stream) +{ dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); rmm::device_uvector row_counts(m + 1, stream); - CUDA_CHECK( - cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream)); - csr_add_calc_row_counts_kernel - <<>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, - b_val, nnz2, m, row_counts.data()); + csr_add_calc_row_counts_kernel<<>>( + a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, row_counts.data()); int cnnz = 0; raft::update_host(&cnnz, row_counts.data() + m, 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); // create csr compressed row index from row counts - thrust::device_ptr row_counts_d = - thrust::device_pointer_cast(row_counts.data()); - thrust::device_ptr c_ind_d = thrust::device_pointer_cast(out_ind); - exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, - c_ind_d); + thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); + thrust::device_ptr c_ind_d = thrust::device_pointer_cast(out_ind); + exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d); return cnnz; } @@ -206,16 +223,25 @@ size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, * @param stream: cuda stream to use */ template -void csr_add_finalize(const int *a_ind, const int *a_indptr, const T *a_val, - int nnz1, const int *b_ind, const int *b_indptr, - const T *b_val, int nnz2, int m, int *c_ind, - int *c_indptr, T *c_val, cudaStream_t stream) { +void csr_add_finalize(const int* a_ind, + const int* a_indptr, + const T* a_val, + int nnz1, + const int* b_ind, + const int* b_indptr, + const T* b_val, + int nnz2, + int m, + int* c_ind, + int* c_indptr, + T* c_val, + cudaStream_t stream) +{ dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_add_kernel - <<>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, - b_val, nnz2, m, c_ind, c_indptr, c_val); + csr_add_kernel<<>>( + a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, c_ind, c_indptr, c_val); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh index ef6a067c39..052f674325 100644 --- a/cpp/include/raft/sparse/linalg/degree.cuh +++ b/cpp/include/raft/sparse/linalg/degree.cuh @@ -44,11 +44,10 @@ namespace linalg { * @param results array to place results */ template -__global__ void coo_degree_kernel(const T *rows, int nnz, T *results) { +__global__ void coo_degree_kernel(const T* rows, int nnz, T* results) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < nnz) { - atomicAdd(results + rows[row], (T)1); - } + if (row < nnz) { atomicAdd(results + rows[row], (T)1); } } /** @@ -60,7 +59,8 @@ __global__ void coo_degree_kernel(const T *rows, int nnz, T *results) { * @param stream: cuda stream to use */ template -void coo_degree(const T *rows, int nnz, T *results, cudaStream_t stream) { +void coo_degree(const T* rows, int nnz, T* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); @@ -77,31 +77,28 @@ void coo_degree(const T *rows, int nnz, T *results, cudaStream_t stream) { * @param stream: cuda stream to use */ template -void coo_degree(COO *in, int *results, cudaStream_t stream) { +void coo_degree(COO* in, int* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); - coo_degree_kernel - <<>>(in->rows(), in->nnz, results); + coo_degree_kernel<<>>(in->rows(), in->nnz, results); CUDA_CHECK(cudaGetLastError()); } template -__global__ void coo_degree_nz_kernel(const int *rows, const T *vals, int nnz, - int *results) { +__global__ void coo_degree_nz_kernel(const int* rows, const T* vals, int nnz, int* results) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < nnz && vals[row] != 0.0) { - raft::myAtomicAdd(results + rows[row], 1); - } + if (row < nnz && vals[row] != 0.0) { raft::myAtomicAdd(results + rows[row], 1); } } template -__global__ void coo_degree_scalar_kernel(const int *rows, const T *vals, - int nnz, T scalar, int *results) { +__global__ void coo_degree_scalar_kernel( + const int* rows, const T* vals, int nnz, T scalar, int* results) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; - if (row < nnz && vals[row] != scalar) { - raft::myAtomicAdd(results + rows[row], 1); - } + if (row < nnz && vals[row] != scalar) { raft::myAtomicAdd(results + rows[row], 1); } } /** @@ -114,12 +111,12 @@ __global__ void coo_degree_scalar_kernel(const int *rows, const T *vals, * @param stream: cuda stream to use */ template -void coo_degree_scalar(COO *in, T scalar, int *results, - cudaStream_t stream) { +void coo_degree_scalar(COO* in, T scalar, int* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); - coo_degree_scalar_kernel<<>>( - in->rows(), in->vals(), in->nnz, scalar, results); + coo_degree_scalar_kernel + <<>>(in->rows(), in->vals(), in->nnz, scalar, results); CUDA_CHECK(cudaGetLastError()); } @@ -135,8 +132,9 @@ void coo_degree_scalar(COO *in, T scalar, int *results, * @param stream: cuda stream to use */ template -void coo_degree_scalar(const int *rows, const T *vals, int nnz, T scalar, - int *results, cudaStream_t stream = 0) { +void coo_degree_scalar( + const int* rows, const T* vals, int nnz, T scalar, int* results, cudaStream_t stream = 0) +{ dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); coo_degree_scalar_kernel @@ -154,12 +152,11 @@ void coo_degree_scalar(const int *rows, const T *vals, int nnz, T scalar, * @param stream: cuda stream to use */ template -void coo_degree_nz(const int *rows, const T *vals, int nnz, int *results, - cudaStream_t stream) { +void coo_degree_nz(const int* rows, const T* vals, int nnz, int* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); - coo_degree_nz_kernel - <<>>(rows, vals, nnz, results); + coo_degree_nz_kernel<<>>(rows, vals, nnz, results); } /** @@ -171,7 +168,8 @@ void coo_degree_nz(const int *rows, const T *vals, int nnz, int *results, * @param stream: cuda stream to use */ template -void coo_degree_nz(COO *in, int *results, cudaStream_t stream) { +void coo_degree_nz(COO* in, int* results, cudaStream_t stream) +{ dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh index bfcd3fd592..59dc5ff3e4 100644 --- a/cpp/include/raft/sparse/linalg/norm.cuh +++ b/cpp/include/raft/sparse/linalg/norm.cuh @@ -41,10 +41,12 @@ __global__ void csr_row_normalize_l1_kernel( // @TODO: This can be done much more parallel by // having threads in a warp compute the sum in parallel // over each row and then divide the values in parallel. - const int *ia, // csr row ex_scan (sorted by row) - const T *vals, int nnz, // array of values and number of non-zeros - int m, // num rows in csr - T *result) { // output array + const int* ia, // csr row ex_scan (sorted by row) + const T* vals, + int nnz, // array of values and number of non-zeros + int m, // num rows in csr + T* result) +{ // output array // row-based matrix 1 thread per row int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -52,7 +54,7 @@ __global__ void csr_row_normalize_l1_kernel( // sum all vals_arr for row and divide each val by sum if (row < m) { int start_idx = ia[row]; - int stop_idx = 0; + int stop_idx = 0; if (row < m - 1) { stop_idx = ia[row + 1]; } else @@ -65,7 +67,7 @@ __global__ void csr_row_normalize_l1_kernel( for (int j = start_idx; j < stop_idx; j++) { if (sum != 0.0) { - T val = vals[j]; + T val = vals[j]; result[j] = val / sum; } else { result[j] = 0.0; @@ -85,18 +87,18 @@ __global__ void csr_row_normalize_l1_kernel( * @param stream: cuda stream to use */ template -void csr_row_normalize_l1(const int *ia, // csr row ex_scan (sorted by row) - const T *vals, +void csr_row_normalize_l1(const int* ia, // csr row ex_scan (sorted by row) + const T* vals, int nnz, // array of values and number of non-zeros int m, // num rows in csr - T *result, - cudaStream_t stream) { // output array + T* result, + cudaStream_t stream) +{ // output array dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_row_normalize_l1_kernel - <<>>(ia, vals, nnz, m, result); + csr_row_normalize_l1_kernel<<>>(ia, vals, nnz, m, result); CUDA_CHECK(cudaGetLastError()); } @@ -105,10 +107,12 @@ __global__ void csr_row_normalize_max_kernel( // @TODO: This can be done much more parallel by // having threads in a warp compute the sum in parallel // over each row and then divide the values in parallel. - const int *ia, // csr row ind array (sorted by row) - const T *vals, int nnz, // array of values and number of non-zeros - int m, // num total rows in csr - T *result) { // output array + const int* ia, // csr row ind array (sorted by row) + const T* vals, + int nnz, // array of values and number of non-zeros + int m, // num total rows in csr + T* result) +{ // output array // row-based matrix 1 thread per row int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -116,7 +120,7 @@ __global__ void csr_row_normalize_max_kernel( // find max across columns and divide if (row < m) { int start_idx = ia[row]; - int stop_idx = 0; + int stop_idx = 0; if (row < m - 1) { stop_idx = ia[row + 1]; } else @@ -130,7 +134,7 @@ __global__ void csr_row_normalize_max_kernel( // divide nonzeros in current row by max for (int j = start_idx; j < stop_idx; j++) { if (max != 0.0 && max > std::numeric_limits::min()) { - T val = vals[j]; + T val = vals[j]; result[j] = val / max; } else { result[j] = 0.0; @@ -151,16 +155,17 @@ __global__ void csr_row_normalize_max_kernel( */ template -void csr_row_normalize_max(const int *ia, // csr row ind array (sorted by row) - const T *vals, +void csr_row_normalize_max(const int* ia, // csr row ind array (sorted by row) + const T* vals, int nnz, // array of values and number of non-zeros int m, // num total rows in csr - T *result, cudaStream_t stream) { + T* result, + cudaStream_t stream) +{ dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_row_normalize_max_kernel - <<>>(ia, vals, nnz, m, result); + csr_row_normalize_max_kernel<<>>(ia, vals, nnz, m, result); CUDA_CHECK(cudaGetLastError()); } diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh index ce0c4bbe6f..a293e359c2 100644 --- a/cpp/include/raft/sparse/linalg/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/spectral.cuh @@ -30,15 +30,22 @@ namespace sparse { namespace spectral { template -void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, - int nnz, int n, int n_components, T *out, - unsigned long long seed = 1234567) { +void fit_embedding(const raft::handle_t& handle, + int* rows, + int* cols, + T* vals, + int nnz, + int n, + int n_components, + T* out, + unsigned long long seed = 1234567) +{ auto stream = handle.get_stream(); rmm::device_uvector src_offsets(n + 1, stream); rmm::device_uvector dst_cols(nnz, stream); rmm::device_uvector dst_vals(nnz, stream); - convert::coo_to_csr(handle, rows, cols, vals, nnz, n, src_offsets.data(), - dst_cols.data(), dst_vals.data()); + convert::coo_to_csr( + handle, rows, cols, vals, nnz, n, src_offsets.data(), dst_cols.data(), dst_vals.data()); rmm::device_uvector eigVals(n_components + 1, stream); rmm::device_uvector eigVecs(n * (n_components + 1), stream); @@ -52,45 +59,49 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, using index_type = int; using value_type = T; - index_type *ro = src_offsets.data(); - index_type *ci = dst_cols.data(); - value_type *vs = dst_vals.data(); + index_type* ro = src_offsets.data(); + index_type* ci = dst_cols.data(); + value_type* vs = dst_vals.data(); - raft::matrix::sparse_matrix_t const r_csr_m{ - handle, ro, ci, vs, n, nnz}; + raft::matrix::sparse_matrix_t const r_csr_m{handle, ro, ci, vs, n, nnz}; - index_type neigvs = n_components + 1; - index_type maxiter = 4000; //default reset value (when set to 0); - value_type tol = 0.01; - index_type restart_iter = 15 + neigvs; //what cugraph is using + index_type neigvs = n_components + 1; + index_type maxiter = 4000; // default reset value (when set to 0); + value_type tol = 0.01; + index_type restart_iter = 15 + neigvs; // what cugraph is using - raft::eigen_solver_config_t cfg{neigvs, maxiter, - restart_iter, tol}; + raft::eigen_solver_config_t cfg{neigvs, maxiter, restart_iter, tol}; cfg.seed = seed; raft::lanczos_solver_t eig_solver{cfg}; - //cluster computation here is irrelevant, - //hence define a no-op such solver to - //feed partition(): + // cluster computation here is irrelevant, + // hence define a no-op such solver to + // feed partition(): // struct no_op_cluster_solver_t { using index_type_t = index_type; - using size_type_t = index_type; + using size_type_t = index_type; using value_type_t = value_type; - std::pair solve( - handle_t const &handle, size_type_t n_obs_vecs, size_type_t dim, - value_type_t const *__restrict__ obs, - index_type_t *__restrict__ codes) const { + std::pair solve(handle_t const& handle, + size_type_t n_obs_vecs, + size_type_t dim, + value_type_t const* __restrict__ obs, + index_type_t* __restrict__ codes) const + { return std::make_pair(0, 0); } }; - raft::spectral::partition(handle, r_csr_m, eig_solver, - no_op_cluster_solver_t{}, labels.data(), - eigVals.data(), eigVecs.data()); + raft::spectral::partition(handle, + r_csr_m, + eig_solver, + no_op_cluster_solver_t{}, + labels.data(), + eigVals.data(), + eigVecs.data()); raft::copy(out, eigVecs.data() + n, n * n_components, stream); diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh index a6e1027288..ae89e7993c 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh @@ -47,26 +47,34 @@ namespace linalg { // TODO: value_idx param needs to be used for this once FAISS is updated to use float32 // for indices so that the index types can be uniform template -__global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, - T *vals, int *orows, int *ocols, T *ovals, - int n, int cnnz, Lambda reduction_op) { +__global__ void coo_symmetrize_kernel(int* row_ind, + int* rows, + int* cols, + T* vals, + int* orows, + int* ocols, + T* ovals, + int n, + int cnnz, + Lambda reduction_op) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < n) { int start_idx = row_ind[row]; // each thread processes one row - int stop_idx = get_stop_idx(row, n, cnnz, row_ind); + int stop_idx = get_stop_idx(row, n, cnnz, row_ind); - int row_nnz = 0; + int row_nnz = 0; int out_start_idx = start_idx * 2; for (int idx = 0; idx < stop_idx - start_idx; idx++) { int cur_row = rows[idx + start_idx]; int cur_col = cols[idx + start_idx]; - T cur_val = vals[idx + start_idx]; + T cur_val = vals[idx + start_idx]; int lookup_row = cur_col; - int t_start = row_ind[lookup_row]; // Start at - int t_stop = get_stop_idx(lookup_row, n, cnnz, row_ind); + int t_start = row_ind[lookup_row]; // Start at + int t_stop = get_stop_idx(lookup_row, n, cnnz, row_ind); T transpose = 0.0; @@ -77,7 +85,7 @@ __global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, // done in a different thread. if (cols[t_idx] == cur_row && rows[t_idx] == cur_col) { // If it exists already, set transposed value to existing value - transpose = vals[t_idx]; + transpose = vals[t_idx]; found_match = true; break; } @@ -123,9 +131,11 @@ __global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, * @param stream: cuda stream to use */ template -void coo_symmetrize(COO *in, COO *out, +void coo_symmetrize(COO* in, + COO* out, Lambda reduction_op, // two-argument reducer - cudaStream_t stream) { + cudaStream_t stream) +{ dim3 grid(raft::ceildiv(in->n_rows, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); @@ -137,9 +147,16 @@ void coo_symmetrize(COO *in, COO *out, out->allocate(in->nnz * 2, in->n_rows, in->n_cols, true, stream); - coo_symmetrize_kernel<<>>( - in_row_ind.data(), in->rows(), in->cols(), in->vals(), out->rows(), - out->cols(), out->vals(), in->n_rows, in->nnz, reduction_op); + coo_symmetrize_kernel<<>>(in_row_ind.data(), + in->rows(), + in->cols(), + in->vals(), + out->rows(), + out->cols(), + out->vals(), + in->n_rows, + in->nnz, + reduction_op); CUDA_CHECK(cudaPeekAtLastError()); } @@ -155,14 +172,15 @@ void coo_symmetrize(COO *in, COO *out, * @param row_sizes2: Input empty row sum 2 array(n) for faster reduction */ template -__global__ static void symmetric_find_size(const value_t *restrict data, - const value_idx *restrict indices, - const value_idx n, const int k, - value_idx *restrict row_sizes, - value_idx *restrict row_sizes2) { +__global__ static void symmetric_find_size(const value_t* restrict data, + const value_idx* restrict indices, + const value_idx n, + const int k, + value_idx* restrict row_sizes, + value_idx* restrict row_sizes2) +{ const auto row = blockIdx.x * blockDim.x + threadIdx.x; // for every row - const auto j = - blockIdx.y * blockDim.y + threadIdx.y; // for every item in row + const auto j = blockIdx.y * blockDim.y + threadIdx.y; // for every item in row if (row >= n || j >= k) return; const auto col = indices[row * k + j]; @@ -182,9 +200,11 @@ __global__ static void symmetric_find_size(const value_t *restrict data, * @param row_sizes2: Input row sum 2 array(n) for faster reduction */ template -__global__ static void reduce_find_size(const value_idx n, const int k, - value_idx *restrict row_sizes, - const value_idx *restrict row_sizes2) { +__global__ static void reduce_find_size(const value_idx n, + const int k, + value_idx* restrict row_sizes, + const value_idx* restrict row_sizes2) +{ const auto i = (blockIdx.x * blockDim.x) + threadIdx.x; if (i >= n) return; row_sizes[i] += (row_sizes2[i] + k); @@ -205,20 +225,21 @@ __global__ static void reduce_find_size(const value_idx n, const int k, * @param k: Number of n_neighbors */ template -__global__ static void symmetric_sum(value_idx *restrict edges, - const value_t *restrict data, - const value_idx *restrict indices, - value_t *restrict VAL, - value_idx *restrict COL, - value_idx *restrict ROW, const value_idx n, - const int k) { +__global__ static void symmetric_sum(value_idx* restrict edges, + const value_t* restrict data, + const value_idx* restrict indices, + value_t* restrict VAL, + value_idx* restrict COL, + value_idx* restrict ROW, + const value_idx n, + const int k) +{ const auto row = blockIdx.x * blockDim.x + threadIdx.x; // for every row - const auto j = - blockIdx.y * blockDim.y + threadIdx.y; // for every item in row + const auto j = blockIdx.y * blockDim.y + threadIdx.y; // for every item in row if (row >= n || j >= k) return; - const auto col = indices[row * k + j]; - const auto original = atomicAdd(&edges[row], value_idx(1)); + const auto col = indices[row * k + j]; + const auto original = atomicAdd(&edges[row], value_idx(1)); const auto transpose = atomicAdd(&edges[col], value_idx(1)); VAL[transpose] = VAL[original] = data[row * k + j]; @@ -247,27 +268,25 @@ __global__ static void symmetric_sum(value_idx *restrict edges, * @param out: Output COO Matrix class * @param stream: Input cuda stream */ -template -void from_knn_symmetrize_matrix(const value_idx *restrict knn_indices, - const value_t *restrict knn_dists, - const value_idx n, const int k, - COO *out, - cudaStream_t stream) { +template +void from_knn_symmetrize_matrix(const value_idx* restrict knn_indices, + const value_t* restrict knn_dists, + const value_idx n, + const int k, + COO* out, + cudaStream_t stream) +{ // (1) Find how much space needed in each row // We look through all datapoints and increment the count for each row. const dim3 threadsPerBlock(TPB_X, TPB_Y); - const dim3 numBlocks(raft::ceildiv(n, (value_idx)TPB_X), - raft::ceildiv(k, TPB_Y)); + const dim3 numBlocks(raft::ceildiv(n, (value_idx)TPB_X), raft::ceildiv(k, TPB_Y)); // Notice n+1 since we can reuse these arrays for transpose_edges, original_edges in step (4) rmm::device_uvector row_sizes(n, stream); - CUDA_CHECK( - cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream)); + CUDA_CHECK(cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream)); rmm::device_uvector row_sizes2(n, stream); - CUDA_CHECK( - cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream)); + CUDA_CHECK(cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream)); symmetric_find_size<<>>( knn_dists, knn_indices, n, k, row_sizes.data(), row_sizes2.data()); @@ -288,14 +307,12 @@ void from_knn_symmetrize_matrix(const value_idx *restrict knn_indices, // This mirrors CSR matrix's row Pointer, were maximum bounds for each row // are calculated as the cumulative rolling sum of the previous rows. // Notice reusing old row_sizes2 memory - value_idx *edges = row_sizes2.data(); - thrust::device_ptr __edges = thrust::device_pointer_cast(edges); - thrust::device_ptr __row_sizes = - thrust::device_pointer_cast(row_sizes.data()); + value_idx* edges = row_sizes2.data(); + thrust::device_ptr __edges = thrust::device_pointer_cast(edges); + thrust::device_ptr __row_sizes = thrust::device_pointer_cast(row_sizes.data()); // Rolling cumulative sum - thrust::exclusive_scan(rmm::exec_policy(stream), __row_sizes, __row_sizes + n, - __edges); + thrust::exclusive_scan(rmm::exec_policy(stream), __row_sizes, __row_sizes + n, __edges); // (5) Perform final data + data.T operation in tandem with memcpying symmetric_sum<<>>( @@ -307,9 +324,15 @@ void from_knn_symmetrize_matrix(const value_idx *restrict knn_indices, * Symmetrizes a COO matrix */ template -void symmetrize(const raft::handle_t &handle, const value_idx *rows, - const value_idx *cols, const value_t *vals, size_t m, size_t n, - size_t nnz, raft::sparse::COO &out) { +void symmetrize(const raft::handle_t& handle, + const value_idx* rows, + const value_idx* cols, + const value_t* vals, + size_t m, + size_t n, + size_t nnz, + raft::sparse::COO& out) +{ auto stream = handle.get_stream(); // copy rows to cols and cols to rows @@ -326,13 +349,16 @@ void symmetrize(const raft::handle_t &handle, const value_idx *rows, raft::copy_async(symm_vals.data() + nnz, vals, nnz, stream); // sort COO - raft::sparse::op::coo_sort((value_idx)m, (value_idx)n, (value_idx)nnz * 2, - symm_rows.data(), symm_cols.data(), - symm_vals.data(), stream); - - raft::sparse::op::max_duplicates(handle, out, symm_rows.data(), - symm_cols.data(), symm_vals.data(), nnz * 2, - m, n); + raft::sparse::op::coo_sort((value_idx)m, + (value_idx)n, + (value_idx)nnz * 2, + symm_rows.data(), + symm_cols.data(), + symm_vals.data(), + stream); + + raft::sparse::op::max_duplicates( + handle, out, symm_rows.data(), symm_cols.data(), symm_vals.data(), nnz * 2, m, n); } }; // end NAMESPACE linalg diff --git a/cpp/include/raft/sparse/linalg/transpose.h b/cpp/include/raft/sparse/linalg/transpose.h index 7ad4b93ec0..e3a9b1fbd9 100644 --- a/cpp/include/raft/sparse/linalg/transpose.h +++ b/cpp/include/raft/sparse/linalg/transpose.h @@ -55,27 +55,53 @@ namespace linalg { * @param[in] stream : Cuda stream for ordering events */ template -void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr, - const value_idx *csr_indices, const value_t *csr_data, - value_idx *csc_indptr, value_idx *csc_indices, - value_t *csc_data, value_idx csr_nrows, value_idx csr_ncols, - value_idx nnz, cudaStream_t stream) { +void csr_transpose(cusparseHandle_t handle, + const value_idx* csr_indptr, + const value_idx* csr_indices, + const value_t* csr_data, + value_idx* csc_indptr, + value_idx* csc_indices, + value_t* csc_data, + value_idx csr_nrows, + value_idx csr_ncols, + value_idx nnz, + cudaStream_t stream) +{ size_t convert_csc_workspace_size = 0; - CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize( - handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices, - csc_data, csc_indptr, csc_indices, CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, - &convert_csc_workspace_size, stream)); + CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize(handle, + csr_nrows, + csr_ncols, + nnz, + csr_data, + csr_indptr, + csr_indices, + csc_data, + csc_indptr, + csc_indices, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + &convert_csc_workspace_size, + stream)); - rmm::device_uvector convert_csc_workspace(convert_csc_workspace_size, - stream); + rmm::device_uvector convert_csc_workspace(convert_csc_workspace_size, stream); - CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc( - handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices, - csc_data, csc_indptr, csc_indices, CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, - convert_csc_workspace.data(), stream)); + CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc(handle, + csr_nrows, + csr_ncols, + nnz, + csr_data, + csr_indptr, + csr_indices, + csc_data, + csc_indptr, + csc_indices, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + convert_csc_workspace.data(), + stream)); } }; // end NAMESPACE linalg diff --git a/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh b/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh index f0d30b0cb7..36d426029b 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh +++ b/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh @@ -28,10 +28,16 @@ namespace mst { namespace detail { template -__global__ void kernel_min_edge_per_vertex( - const edge_t* offsets, const vertex_t* indices, const alteration_t* weights, - const vertex_t* color, const vertex_t* color_index, edge_t* new_mst_edge, - const bool* mst_edge, alteration_t* min_edge_color, const vertex_t v) { +__global__ void kernel_min_edge_per_vertex(const edge_t* offsets, + const vertex_t* indices, + const alteration_t* weights, + const vertex_t* color, + const vertex_t* color_index, + edge_t* new_mst_edge, + const bool* mst_edge, + alteration_t* min_edge_color, + const vertex_t v) +{ edge_t tid = threadIdx.x + blockIdx.x * blockDim.x; unsigned warp_id = tid / 32; @@ -41,14 +47,14 @@ __global__ void kernel_min_edge_per_vertex( __shared__ alteration_t min_edge_weight[32]; __shared__ vertex_t min_color[32]; - min_edge_index[lane_id] = std::numeric_limits::max(); + min_edge_index[lane_id] = std::numeric_limits::max(); min_edge_weight[lane_id] = std::numeric_limits::max(); - min_color[lane_id] = std::numeric_limits::max(); + min_color[lane_id] = std::numeric_limits::max(); __syncthreads(); vertex_t self_color_idx = color_index[warp_id]; - vertex_t self_color = color[self_color_idx]; + vertex_t self_color = color[self_color_idx]; // find the minimum edge associated per row // each thread in warp holds the minimum edge for @@ -56,20 +62,20 @@ __global__ void kernel_min_edge_per_vertex( if (warp_id < v) { // one row is associated with one warp edge_t row_start = offsets[warp_id]; - edge_t row_end = offsets[warp_id + 1]; + edge_t row_end = offsets[warp_id + 1]; // assuming one warp per row // find min for each thread in warp for (edge_t e = row_start + lane_id; e < row_end; e += 32) { alteration_t curr_edge_weight = weights[e]; - vertex_t successor_color_idx = color_index[indices[e]]; - vertex_t successor_color = color[successor_color_idx]; + vertex_t successor_color_idx = color_index[indices[e]]; + vertex_t successor_color = color[successor_color_idx]; if (!mst_edge[e] && self_color != successor_color) { if (curr_edge_weight < min_edge_weight[lane_id]) { - min_color[lane_id] = successor_color; + min_color[lane_id] = successor_color; min_edge_weight[lane_id] = curr_edge_weight; - min_edge_index[lane_id] = e; + min_edge_index[lane_id] = e; } } } @@ -82,9 +88,9 @@ __global__ void kernel_min_edge_per_vertex( for (int offset = 16; offset > 0; offset >>= 1) { if (lane_id < offset) { if (min_edge_weight[lane_id] > min_edge_weight[lane_id + offset]) { - min_color[lane_id] = min_color[lane_id + offset]; + min_color[lane_id] = min_color[lane_id + offset]; min_edge_weight[lane_id] = min_edge_weight[lane_id + offset]; - min_edge_index[lane_id] = min_edge_index[lane_id + offset]; + min_edge_index[lane_id] = min_edge_index[lane_id + offset]; } } __syncthreads(); @@ -102,19 +108,26 @@ __global__ void kernel_min_edge_per_vertex( } } -template -__global__ void min_edge_per_supervertex( - const vertex_t* color, const vertex_t* color_index, edge_t* new_mst_edge, - bool* mst_edge, const vertex_t* indices, const weight_t* weights, - const alteration_t* altered_weights, vertex_t* temp_src, vertex_t* temp_dst, - weight_t* temp_weights, const alteration_t* min_edge_color, const vertex_t v, - bool symmetrize_output) { +template +__global__ void min_edge_per_supervertex(const vertex_t* color, + const vertex_t* color_index, + edge_t* new_mst_edge, + bool* mst_edge, + const vertex_t* indices, + const weight_t* weights, + const alteration_t* altered_weights, + vertex_t* temp_src, + vertex_t* temp_dst, + weight_t* temp_weights, + const alteration_t* min_edge_color, + const vertex_t v, + bool symmetrize_output) +{ auto tid = get_1D_idx(); if (tid < v) { vertex_t vertex_color_idx = color_index[tid]; - vertex_t vertex_color = color[vertex_color_idx]; - edge_t edge_idx = new_mst_edge[tid]; + vertex_t vertex_color = color[vertex_color_idx]; + edge_t edge_idx = new_mst_edge[tid]; // check if valid outgoing edge was found // find minimum edge is same as minimum edge of whole supervertex @@ -129,32 +142,27 @@ __global__ void min_edge_per_supervertex( auto dst = indices[edge_idx]; if (!symmetrize_output) { auto dst_edge_idx = new_mst_edge[dst]; - auto dst_color = color[color_index[dst]]; + auto dst_color = color[color_index[dst]]; // vertices added each other // only if destination has found an edge // the edge points back to source // the edge is minimum edge found for dst color - if (dst_edge_idx != std::numeric_limits::max() && - indices[dst_edge_idx] == tid && + if (dst_edge_idx != std::numeric_limits::max() && indices[dst_edge_idx] == tid && min_edge_color[dst_color] == altered_weights[dst_edge_idx]) { - if (vertex_color > dst_color) { - add_edge = false; - } + if (vertex_color > dst_color) { add_edge = false; } } } if (add_edge) { - temp_src[tid] = tid; - temp_dst[tid] = dst; - temp_weights[tid] = weights[edge_idx]; + temp_src[tid] = tid; + temp_dst[tid] = dst; + temp_weights[tid] = weights[edge_idx]; mst_edge[edge_idx] = true; } } - if (!add_edge) { - new_mst_edge[tid] = std::numeric_limits::max(); - } + if (!add_edge) { new_mst_edge[tid] = std::numeric_limits::max(); } } } } @@ -162,9 +170,13 @@ __global__ void min_edge_per_supervertex( template __global__ void add_reverse_edge(const edge_t* new_mst_edge, const vertex_t* indices, - const weight_t* weights, vertex_t* temp_src, - vertex_t* temp_dst, weight_t* temp_weights, - const vertex_t v, bool symmetrize_output) { + const weight_t* weights, + vertex_t* temp_src, + vertex_t* temp_dst, + weight_t* temp_weights, + const vertex_t v, + bool symmetrize_output) +{ auto tid = get_1D_idx(); if (tid < v) { @@ -186,9 +198,7 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge, // if vertices did not pick each other // add a reverse edge - if (tid != neighbor_vertex_neighbor) { - reverse_needed = true; - } + if (tid != neighbor_vertex_neighbor) { reverse_needed = true; } } } @@ -197,8 +207,8 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge, // it is assumed the each vertex only picks one valid min edge // per cycle // hence, we store at index tid + v for the reverse edge scenario - temp_src[tid + v] = neighbor_vertex; - temp_dst[tid + v] = tid; + temp_src[tid + v] = neighbor_vertex; + temp_dst[tid + v] = tid; temp_weights[tid + v] = weights[edge_idx]; } } @@ -207,11 +217,13 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge, // executes for newly added mst edges and updates the colors of both vertices to the lower color template -__global__ void min_pair_colors(const vertex_t v, const vertex_t* indices, +__global__ void min_pair_colors(const vertex_t v, + const vertex_t* indices, const edge_t* new_mst_edge, const vertex_t* color, const vertex_t* color_index, - vertex_t* next_color) { + vertex_t* next_color) +{ auto i = get_1D_idx(); if (i < v) { @@ -220,9 +232,9 @@ __global__ void min_pair_colors(const vertex_t v, const vertex_t* indices, if (edge_idx != std::numeric_limits::max()) { vertex_t neighbor_vertex = indices[edge_idx]; // vertex_t self_color = color[i]; - vertex_t self_color_idx = color_index[i]; - vertex_t self_color = color[self_color_idx]; - vertex_t neighbor_color_idx = color_index[neighbor_vertex]; + vertex_t self_color_idx = color_index[i]; + vertex_t self_color = color[self_color_idx]; + vertex_t neighbor_color_idx = color_index[neighbor_vertex]; vertex_t neighbor_super_color = color[neighbor_color_idx]; // update my own color as source of edge @@ -238,33 +250,36 @@ __global__ void min_pair_colors(const vertex_t v, const vertex_t* indices, // for each vertex, update color if it was changed in min_pair_colors kernel template -__global__ void update_colors(const vertex_t v, vertex_t* color, +__global__ void update_colors(const vertex_t v, + vertex_t* color, const vertex_t* color_index, - const vertex_t* next_color, bool* done) { + const vertex_t* next_color, + bool* done) +{ auto i = get_1D_idx(); if (i < v) { - vertex_t self_color = color[i]; + vertex_t self_color = color[i]; vertex_t self_color_idx = color_index[i]; - vertex_t new_color = next_color[self_color_idx]; + vertex_t new_color = next_color[self_color_idx]; // update self color to new smaller color if (self_color > new_color) { color[i] = new_color; - *done = false; + *done = false; } } } // point vertices to their final color index template -__global__ void final_color_indices(const vertex_t v, const vertex_t* color, - vertex_t* color_index) { +__global__ void final_color_indices(const vertex_t v, const vertex_t* color, vertex_t* color_index) +{ auto i = get_1D_idx(); if (i < v) { vertex_t self_color_idx = color_index[i]; - vertex_t self_color = color[self_color_idx]; + vertex_t self_color = color[self_color_idx]; // if self color is not equal to self color index, // it means self is not supervertex @@ -272,7 +287,7 @@ __global__ void final_color_indices(const vertex_t v, const vertex_t* color, // parent supervertex while (self_color_idx != self_color) { self_color_idx = color_index[self_color]; - self_color = color[self_color_idx]; + self_color = color[self_color_idx]; } // point to new supervertex @@ -282,22 +297,23 @@ __global__ void final_color_indices(const vertex_t v, const vertex_t* color, // Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu // Consider using curand device API instead of precomputed random_values array -template -__global__ void alteration_kernel(const vertex_t v, const edge_t e, +template +__global__ void alteration_kernel(const vertex_t v, + const edge_t e, const edge_t* offsets, const vertex_t* indices, - const weight_t* weights, alteration_t max, + const weight_t* weights, + alteration_t max, alteration_t* random_values, - alteration_t* altered_weights) { + alteration_t* altered_weights) +{ auto row = get_1D_idx(); if (row < v) { auto row_begin = offsets[row]; - auto row_end = offsets[row + 1]; + auto row_end = offsets[row + 1]; for (auto i = row_begin; i < row_end; i++) { - auto column = indices[i]; - altered_weights[i] = - weights[i] + max * (random_values[row] + random_values[column]); + auto column = indices[i]; + altered_weights[i] = weights[i] + max * (random_values[row] + random_values[column]); } } } @@ -305,17 +321,15 @@ __global__ void alteration_kernel(const vertex_t v, const edge_t e, template __global__ void kernel_count_new_mst_edges(const vertex_t* mst_src, edge_t* mst_edge_count, - const vertex_t v) { + const vertex_t v) +{ auto tid = get_1D_idx(); // count number of new mst edges added - bool predicate = - tid < v && (mst_src[tid] != std::numeric_limits::max()); + bool predicate = tid < v && (mst_src[tid] != std::numeric_limits::max()); vertex_t block_count = __syncthreads_count(predicate); - if (threadIdx.x == 0 && block_count > 0) { - atomicAdd(mst_edge_count, block_count); - } + if (threadIdx.x == 0 && block_count > 0) { atomicAdd(mst_edge_count, block_count); } } } // namespace detail diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh index 33b980afcd..5591e15b19 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh +++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh @@ -40,21 +40,30 @@ typedef std::chrono::high_resolution_clock Clock; // curand generator uniform inline curandStatus_t curand_generate_uniformX(curandGenerator_t generator, - float* outputPtr, size_t n) { + float* outputPtr, + size_t n) +{ return curandGenerateUniform(generator, outputPtr, n); } inline curandStatus_t curand_generate_uniformX(curandGenerator_t generator, - double* outputPtr, size_t n) { + double* outputPtr, + size_t n) +{ return curandGenerateUniformDouble(generator, outputPtr, n); } -template -MST_solver::MST_solver( - const raft::handle_t& handle_, const edge_t* offsets_, - const vertex_t* indices_, const weight_t* weights_, const vertex_t v_, - const edge_t e_, vertex_t* color_, cudaStream_t stream_, - bool symmetrize_output_, bool initialize_colors_, int iterations_) +template +MST_solver::MST_solver(const raft::handle_t& handle_, + const edge_t* offsets_, + const vertex_t* indices_, + const weight_t* weights_, + const vertex_t v_, + const edge_t e_, + vertex_t* color_, + cudaStream_t stream_, + bool symmetrize_output_, + bool initialize_colors_, + int iterations_) : handle(handle_), offsets(offsets_), indices(indices_), @@ -76,17 +85,17 @@ MST_solver::MST_solver( stream(stream_), symmetrize_output(symmetrize_output_), initialize_colors(initialize_colors_), - iterations(iterations_) { - max_blocks = handle_.get_device_properties().maxGridSize[0]; + iterations(iterations_) +{ + max_blocks = handle_.get_device_properties().maxGridSize[0]; max_threads = handle_.get_device_properties().maxThreadsPerBlock; - sm_count = handle_.get_device_properties().multiProcessorCount; + sm_count = handle_.get_device_properties().multiProcessorCount; mst_edge_count.set_value_to_zero_async(stream); prev_mst_edge_count.set_value_to_zero_async(stream); - CUDA_CHECK(cudaMemsetAsync(mst_edge.data(), 0, mst_edge.size() * sizeof(bool), - stream)); + CUDA_CHECK(cudaMemsetAsync(mst_edge.data(), 0, mst_edge.size() * sizeof(bool), stream)); - //Initially, color holds the vertex id as color + // Initially, color holds the vertex id as color auto policy = handle.get_thrust_policy(); if (initialize_colors_) { thrust::sequence(policy, color.begin(), color.end(), 0); @@ -97,10 +106,10 @@ MST_solver::MST_solver( thrust::sequence(policy, next_color.begin(), next_color.end(), 0); } -template +template raft::Graph_COO -MST_solver::solve() { +MST_solver::solve() +{ RAFT_EXPECTS(v > 0, "0 vertices"); RAFT_EXPECTS(e > 0, "0 edges"); RAFT_EXPECTS(offsets != nullptr, "Null offsets."); @@ -113,12 +122,13 @@ MST_solver::solve() { // Alterating the weights // this is done by identifying the lowest cost edge weight gap that is not 0, call this theta. - // For each edge, add noise that is less than theta. That is, generate a random number in the range [0.0, theta) and add it to each edge weight. + // For each edge, add noise that is less than theta. That is, generate a random number in the + // range [0.0, theta) and add it to each edge weight. alteration(); #ifdef MST_TIME auto stop = Clock::now(); - timer0 = duration_us(stop - start); + timer0 = duration_us(stop - start); #endif auto max_mst_edges = symmetrize_output ? 2 * v - 2 : v - 1; @@ -167,8 +177,8 @@ MST_solver::solve() { if (curr_mst_edge_count == prev_mst_edge_count.value(stream)) { #ifdef MST_TIME std::cout << "Iterations: " << i << std::endl; - std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3 - << "," << timer4 << "," << timer5 << std::endl; + std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3 << "," << timer4 << "," + << timer5 << std::endl; #endif // exit here when reaching steady state break; @@ -178,8 +188,7 @@ MST_solver::solve() { start = Clock::now(); #endif // append the newly found MST edges to the final output - append_src_dst_pair(mst_result.src.data(), mst_result.dst.data(), - mst_result.weights.data()); + append_src_dst_pair(mst_result.src.data(), mst_result.dst.data(), mst_result.weights.data()); #ifdef MST_TIME stop = Clock::now(); timer4 += duration_us(stop - start); @@ -210,50 +219,46 @@ MST_solver::solve() { // ||y|-|x|| template struct alteration_functor { - __host__ __device__ weight_t - operator()(const thrust::tuple& t) { + __host__ __device__ weight_t operator()(const thrust::tuple& t) + { auto x = thrust::get<0>(t); auto y = thrust::get<1>(t); - x = x < 0 ? -x : x; - y = y < 0 ? -y : y; + x = x < 0 ? -x : x; + y = y < 0 ? -y : y; return x < y ? y - x : x - y; } }; // Compute the uper bound for the alteration -template -alteration_t -MST_solver::alteration_max() { +template +alteration_t MST_solver::alteration_max() +{ auto policy = handle.get_thrust_policy(); rmm::device_uvector tmp(e, stream); thrust::device_ptr weights_ptr(weights); thrust::copy(policy, weights_ptr, weights_ptr + e, tmp.begin()); - //sort tmp weights + // sort tmp weights thrust::sort(policy, tmp.begin(), tmp.end()); - //remove duplicates + // remove duplicates auto new_end = thrust::unique(policy, tmp.begin(), tmp.end()); - //min(a[i+1]-a[i])/2 - auto begin = - thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1)); - auto end = - thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end)); - auto init = tmp.element(1, stream) - tmp.element(0, stream); - auto max = - thrust::transform_reduce(policy, begin, end, alteration_functor(), - init, thrust::minimum()); + // min(a[i+1]-a[i])/2 + auto begin = thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1)); + auto end = thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end)); + auto init = tmp.element(1, stream) - tmp.element(0, stream); + auto max = thrust::transform_reduce( + policy, begin, end, alteration_functor(), init, thrust::minimum()); return max / static_cast(2); } // Compute the alteration to make all undirected edge weight unique // Preserves weights order -template -void MST_solver::alteration() { +template +void MST_solver::alteration() +{ auto nthreads = std::min(v, max_threads); - auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); + auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); // maximum alteration that does not change realtive weights order alteration_t max = alteration_max(); @@ -270,34 +275,32 @@ void MST_solver::alteration() { auto curand_status = curand_generate_uniformX(randGen, rand_values.data(), v); RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND failed"); curand_status = curandDestroyGenerator(randGen); - RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, - "MST: CURAND cleanup failed"); + RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND cleanup failed"); - //Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu + // Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu detail::alteration_kernel<<>>( - v, e, offsets, indices, weights, max, rand_values.data(), - altered_weights.data()); + v, e, offsets, indices, weights, max, rand_values.data(), altered_weights.data()); } // updates colors of vertices by propagating the lower color to the higher -template -void MST_solver::label_prop( - vertex_t* mst_src, vertex_t* mst_dst) { +template +void MST_solver::label_prop(vertex_t* mst_src, + vertex_t* mst_dst) +{ // update the colors of both ends its until there is no change in colors edge_t curr_mst_edge_count = mst_edge_count.value(stream); auto min_pair_nthreads = std::min(v, (vertex_t)max_threads); - auto min_pair_nblocks = std::min( - (v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks); + auto min_pair_nblocks = + std::min((v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks); edge_t* new_mst_edge_ptr = new_mst_edge.data(); - vertex_t* color_ptr = color.data(); + vertex_t* color_ptr = color.data(); vertex_t* next_color_ptr = next_color.data(); rmm::device_scalar done(stream); done.set_value_to_zero_async(stream); - bool* done_ptr = done.data(); + bool* done_ptr = done.data(); const bool true_val = true; auto i = 0; @@ -312,84 +315,99 @@ void MST_solver::label_prop( i++; } - detail:: - final_color_indices<<>>( - v, color_ptr, color_index); + detail::final_color_indices<<>>( + v, color_ptr, color_index); #ifdef MST_TIME std::cout << "Label prop iterations: " << i << std::endl; #endif } // Finds the minimum edge from each vertex to the lowest color -template -void MST_solver::min_edge_per_vertex() { +template +void MST_solver::min_edge_per_vertex() +{ auto policy = handle.get_thrust_policy(); - thrust::fill(policy, min_edge_color.begin(), min_edge_color.end(), - std::numeric_limits::max()); - thrust::fill(policy, new_mst_edge.begin(), new_mst_edge.end(), - std::numeric_limits::max()); + thrust::fill( + policy, min_edge_color.begin(), min_edge_color.end(), std::numeric_limits::max()); + thrust::fill( + policy, new_mst_edge.begin(), new_mst_edge.end(), std::numeric_limits::max()); int n_threads = 32; - vertex_t* color_ptr = color.data(); - edge_t* new_mst_edge_ptr = new_mst_edge.data(); - bool* mst_edge_ptr = mst_edge.data(); - alteration_t* min_edge_color_ptr = min_edge_color.data(); + vertex_t* color_ptr = color.data(); + edge_t* new_mst_edge_ptr = new_mst_edge.data(); + bool* mst_edge_ptr = mst_edge.data(); + alteration_t* min_edge_color_ptr = min_edge_color.data(); alteration_t* altered_weights_ptr = altered_weights.data(); - detail::kernel_min_edge_per_vertex<<>>( - offsets, indices, altered_weights_ptr, color_ptr, color_index, - new_mst_edge_ptr, mst_edge_ptr, min_edge_color_ptr, v); + detail::kernel_min_edge_per_vertex<<>>(offsets, + indices, + altered_weights_ptr, + color_ptr, + color_index, + new_mst_edge_ptr, + mst_edge_ptr, + min_edge_color_ptr, + v); } // Finds the minimum edge from each supervertex to the lowest color -template -void MST_solver::min_edge_per_supervertex() { +template +void MST_solver::min_edge_per_supervertex() +{ auto nthreads = std::min(v, max_threads); - auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); + auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks); auto policy = handle.get_thrust_policy(); - thrust::fill(policy, temp_src.begin(), temp_src.end(), - std::numeric_limits::max()); + thrust::fill(policy, temp_src.begin(), temp_src.end(), std::numeric_limits::max()); - vertex_t* color_ptr = color.data(); - edge_t* new_mst_edge_ptr = new_mst_edge.data(); - bool* mst_edge_ptr = mst_edge.data(); - alteration_t* min_edge_color_ptr = min_edge_color.data(); + vertex_t* color_ptr = color.data(); + edge_t* new_mst_edge_ptr = new_mst_edge.data(); + bool* mst_edge_ptr = mst_edge.data(); + alteration_t* min_edge_color_ptr = min_edge_color.data(); alteration_t* altered_weights_ptr = altered_weights.data(); - vertex_t* temp_src_ptr = temp_src.data(); - vertex_t* temp_dst_ptr = temp_dst.data(); - weight_t* temp_weights_ptr = temp_weights.data(); - - detail::min_edge_per_supervertex<<>>( - color_ptr, color_index, new_mst_edge_ptr, mst_edge_ptr, indices, weights, - altered_weights_ptr, temp_src_ptr, temp_dst_ptr, temp_weights_ptr, - min_edge_color_ptr, v, symmetrize_output); + vertex_t* temp_src_ptr = temp_src.data(); + vertex_t* temp_dst_ptr = temp_dst.data(); + weight_t* temp_weights_ptr = temp_weights.data(); + + detail::min_edge_per_supervertex<<>>(color_ptr, + color_index, + new_mst_edge_ptr, + mst_edge_ptr, + indices, + weights, + altered_weights_ptr, + temp_src_ptr, + temp_dst_ptr, + temp_weights_ptr, + min_edge_color_ptr, + v, + symmetrize_output); // the above kernel only adds directed mst edges in the case where // a pair of vertices don't pick the same min edge between them // so, now we add the reverse edge to make it undirected if (symmetrize_output) { - detail::add_reverse_edge<<>>( - new_mst_edge_ptr, indices, weights, temp_src_ptr, temp_dst_ptr, - temp_weights_ptr, v, symmetrize_output); + detail::add_reverse_edge<<>>(new_mst_edge_ptr, + indices, + weights, + temp_src_ptr, + temp_dst_ptr, + temp_weights_ptr, + v, + symmetrize_output); } } -template -void MST_solver::check_termination() { +template +void MST_solver::check_termination() +{ vertex_t nthreads = std::min(2 * v, (vertex_t)max_threads); - vertex_t nblocks = - std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks); + vertex_t nblocks = std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks); // count number of new mst edges edge_t* mst_edge_count_ptr = mst_edge_count.data(); - vertex_t* temp_src_ptr = temp_src.data(); + vertex_t* temp_src_ptr = temp_src.data(); detail::kernel_count_new_mst_edges<<>>( temp_src_ptr, mst_edge_count_ptr, 2 * v); @@ -397,36 +415,40 @@ void MST_solver::check_termination() { template struct new_edges_functor { - __host__ __device__ bool operator()( - const thrust::tuple& t) { + __host__ __device__ bool operator()(const thrust::tuple& t) + { auto src = thrust::get<0>(t); return src != std::numeric_limits::max() ? true : false; } }; -template +template void MST_solver::append_src_dst_pair( - vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) { + vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) +{ auto policy = handle.get_thrust_policy(); edge_t curr_mst_edge_count = prev_mst_edge_count.value(stream); // iterator to end of mst edges added to final output in previous iteration - auto src_dst_zip_end = thrust::make_zip_iterator(thrust::make_tuple( - mst_src + curr_mst_edge_count, mst_dst + curr_mst_edge_count, - mst_weights + curr_mst_edge_count)); + auto src_dst_zip_end = + thrust::make_zip_iterator(thrust::make_tuple(mst_src + curr_mst_edge_count, + mst_dst + curr_mst_edge_count, + mst_weights + curr_mst_edge_count)); // iterator to new mst edges found - auto temp_src_dst_zip_begin = thrust::make_zip_iterator(thrust::make_tuple( - temp_src.begin(), temp_dst.begin(), temp_weights.begin())); + auto temp_src_dst_zip_begin = thrust::make_zip_iterator( + thrust::make_tuple(temp_src.begin(), temp_dst.begin(), temp_weights.begin())); auto temp_src_dst_zip_end = thrust::make_zip_iterator( thrust::make_tuple(temp_src.end(), temp_dst.end(), temp_weights.end())); // copy new mst edges to final output - thrust::copy_if(policy, temp_src_dst_zip_begin, temp_src_dst_zip_end, - src_dst_zip_end, new_edges_functor()); + thrust::copy_if(policy, + temp_src_dst_zip_begin, + temp_src_dst_zip_end, + src_dst_zip_end, + new_edges_functor()); } } // namespace mst diff --git a/cpp/include/raft/sparse/mst/detail/utils.cuh b/cpp/include/raft/sparse/mst/detail/utils.cuh index 4d5ca6ebe1..97a76e1d50 100644 --- a/cpp/include/raft/sparse/mst/detail/utils.cuh +++ b/cpp/include/raft/sparse/mst/detail/utils.cuh @@ -26,32 +26,29 @@ namespace mst { namespace detail { template -__device__ idx_t get_1D_idx() { +__device__ idx_t get_1D_idx() +{ return blockIdx.x * blockDim.x + threadIdx.x; } // somewhat smart vector print template -void printv(rmm::device_uvector& vec, const std::string& name = "", - const size_t displ = 5) { +void printv(rmm::device_uvector& vec, const std::string& name = "", const size_t displ = 5) +{ #ifdef MST_TIME std::cout.precision(15); std::cout << name << " size = " << vec.size() << std::endl; if (displ < vec.size()) { - thrust::copy(vec.begin(), vec.begin() + displ, - std::ostream_iterator(std::cout, " ")); + thrust::copy(vec.begin(), vec.begin() + displ, std::ostream_iterator(std::cout, " ")); std::cout << " ... "; - thrust::copy(vec.end() - displ, vec.end(), - std::ostream_iterator(std::cout, " ")); + thrust::copy(vec.end() - displ, vec.end(), std::ostream_iterator(std::cout, " ")); } else { - thrust::copy(vec.begin(), vec.end(), - std::ostream_iterator(std::cout, " ")); + thrust::copy(vec.begin(), vec.end(), std::ostream_iterator(std::cout, " ")); } std::cout << std::endl << std::endl; #endif } -#define duration_us(a) \ - std::chrono::duration_cast(a).count() +#define duration_us(a) std::chrono::duration_cast(a).count() } // namespace detail } // namespace mst diff --git a/cpp/include/raft/sparse/mst/mst.cuh b/cpp/include/raft/sparse/mst/mst.cuh index 10c981445e..b49003467b 100644 --- a/cpp/include/raft/sparse/mst/mst.cuh +++ b/cpp/include/raft/sparse/mst/mst.cuh @@ -22,16 +22,30 @@ namespace raft { namespace mst { -template -raft::Graph_COO mst( - const raft::handle_t& handle, edge_t const* offsets, vertex_t const* indices, - weight_t const* weights, vertex_t const v, edge_t const e, vertex_t* color, - cudaStream_t stream, bool symmetrize_output = true, - bool initialize_colors = true, int iterations = 0) { - MST_solver mst_solver( - handle, offsets, indices, weights, v, e, color, stream, symmetrize_output, - initialize_colors, iterations); +template +raft::Graph_COO mst(const raft::handle_t& handle, + edge_t const* offsets, + vertex_t const* indices, + weight_t const* weights, + vertex_t const v, + edge_t const e, + vertex_t* color, + cudaStream_t stream, + bool symmetrize_output = true, + bool initialize_colors = true, + int iterations = 0) +{ + MST_solver mst_solver(handle, + offsets, + indices, + weights, + v, + e, + color, + stream, + symmetrize_output, + initialize_colors, + iterations); return mst_solver.solve(); } diff --git a/cpp/include/raft/sparse/mst/mst_solver.cuh b/cpp/include/raft/sparse/mst/mst_solver.cuh index 44b34ee5c7..bae5d77d8e 100644 --- a/cpp/include/raft/sparse/mst/mst_solver.cuh +++ b/cpp/include/raft/sparse/mst/mst_solver.cuh @@ -31,20 +31,27 @@ struct Graph_COO { edge_t n_edges; Graph_COO(vertex_t size, cudaStream_t stream) - : src(size, stream), dst(size, stream), weights(size, stream) {} + : src(size, stream), dst(size, stream), weights(size, stream) + { + } }; namespace mst { -template +template class MST_solver { public: - MST_solver(const raft::handle_t& handle_, const edge_t* offsets_, - const vertex_t* indices_, const weight_t* weights_, - const vertex_t v_, const edge_t e_, vertex_t* color_, - cudaStream_t stream_, bool symmetrize_output_, - bool initialize_colors_, int iterations_); + MST_solver(const raft::handle_t& handle_, + const edge_t* offsets_, + const vertex_t* indices_, + const weight_t* weights_, + const vertex_t v_, + const edge_t e_, + vertex_t* color_, + cudaStream_t stream_, + bool symmetrize_output_, + bool initialize_colors_, + int iterations_); raft::Graph_COO solve(); @@ -56,7 +63,7 @@ class MST_solver { bool symmetrize_output, initialize_colors; int iterations; - //CSR + // CSR const edge_t* offsets; const vertex_t* indices; const weight_t* weights; @@ -67,20 +74,16 @@ class MST_solver { vertex_t max_threads; vertex_t sm_count; - vertex_t* color_index; // represent each supervertex as a color - rmm::device_uvector - min_edge_color; // minimum incident edge weight per color - rmm::device_uvector new_mst_edge; // new minimum edge per vertex - rmm::device_uvector - altered_weights; // weights to be used for mst + vertex_t* color_index; // represent each supervertex as a color + rmm::device_uvector min_edge_color; // minimum incident edge weight per color + rmm::device_uvector new_mst_edge; // new minimum edge per vertex + rmm::device_uvector altered_weights; // weights to be used for mst + rmm::device_scalar mst_edge_count; // total number of edges added after every iteration rmm::device_scalar - mst_edge_count; // total number of edges added after every iteration - rmm::device_scalar - prev_mst_edge_count; // total number of edges up to the previous iteration - rmm::device_uvector - mst_edge; // mst output - true if the edge belongs in mst + prev_mst_edge_count; // total number of edges up to the previous iteration + rmm::device_uvector mst_edge; // mst output - true if the edge belongs in mst rmm::device_uvector next_color; // next iteration color - rmm::device_uvector color; // index of color that vertex points to + rmm::device_uvector color; // index of color that vertex points to // new src-dst pairs found per iteration rmm::device_uvector temp_src; @@ -93,8 +96,7 @@ class MST_solver { void check_termination(); void alteration(); alteration_t alteration_max(); - void append_src_dst_pair(vertex_t* mst_src, vertex_t* mst_dst, - weight_t* mst_weights); + void append_src_dst_pair(vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights); }; } // namespace mst diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh index 492058f85f..8bc8c746f9 100644 --- a/cpp/include/raft/sparse/op/filter.cuh +++ b/cpp/include/raft/sparse/op/filter.cuh @@ -42,15 +42,23 @@ namespace sparse { namespace op { template -__global__ void coo_remove_scalar_kernel(const int *rows, const int *cols, - const T *vals, int nnz, int *crows, - int *ccols, T *cvals, int *ex_scan, - int *cur_ex_scan, int m, T scalar) { +__global__ void coo_remove_scalar_kernel(const int* rows, + const int* cols, + const T* vals, + int nnz, + int* crows, + int* ccols, + T* cvals, + int* ex_scan, + int* cur_ex_scan, + int m, + T scalar) +{ int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < m) { - int start = cur_ex_scan[row]; - int stop = get_stop_idx(row, m, nnz, cur_ex_scan); + int start = cur_ex_scan[row]; + int stop = get_stop_idx(row, m, nnz, cur_ex_scan); int cur_out_idx = ex_scan[row]; for (int idx = start; idx < stop; idx++) { @@ -82,35 +90,49 @@ __global__ void coo_remove_scalar_kernel(const int *rows, const int *cols, * @param stream: cuda stream to use */ template -void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, - int *crows, int *ccols, T *cvals, int *cnnz, - int *cur_cnnz, T scalar, int n, cudaStream_t stream) { +void coo_remove_scalar(const int* rows, + const int* cols, + const T* vals, + int nnz, + int* crows, + int* ccols, + T* cvals, + int* cnnz, + int* cur_cnnz, + T scalar, + int n, + cudaStream_t stream) +{ rmm::device_uvector ex_scan(n, stream); rmm::device_uvector cur_ex_scan(n, stream); CUDA_CHECK(cudaMemsetAsync(ex_scan.data(), 0, n * sizeof(int), stream)); CUDA_CHECK(cudaMemsetAsync(cur_ex_scan.data(), 0, n * sizeof(int), stream)); - thrust::device_ptr dev_cnnz = thrust::device_pointer_cast(cnnz); - thrust::device_ptr dev_ex_scan = - thrust::device_pointer_cast(ex_scan.data()); - thrust::exclusive_scan(rmm::exec_policy(stream), dev_cnnz, dev_cnnz + n, - dev_ex_scan); + thrust::device_ptr dev_cnnz = thrust::device_pointer_cast(cnnz); + thrust::device_ptr dev_ex_scan = thrust::device_pointer_cast(ex_scan.data()); + thrust::exclusive_scan(rmm::exec_policy(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); - thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz); - thrust::device_ptr dev_cur_ex_scan = - thrust::device_pointer_cast(cur_ex_scan.data()); - thrust::exclusive_scan(rmm::exec_policy(stream), dev_cur_cnnz, - dev_cur_cnnz + n, dev_cur_ex_scan); + thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz); + thrust::device_ptr dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data()); + thrust::exclusive_scan(rmm::exec_policy(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); dim3 grid(raft::ceildiv(n, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - coo_remove_scalar_kernel<<>>( - rows, cols, vals, nnz, crows, ccols, cvals, dev_ex_scan.get(), - dev_cur_ex_scan.get(), n, scalar); + coo_remove_scalar_kernel<<>>(rows, + cols, + vals, + nnz, + crows, + ccols, + cvals, + dev_ex_scan.get(), + dev_cur_ex_scan.get(), + n, + scalar); CUDA_CHECK(cudaPeekAtLastError()); } @@ -123,33 +145,39 @@ void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, * @param stream: cuda stream to use */ template -void coo_remove_scalar(COO *in, COO *out, T scalar, cudaStream_t stream) { +void coo_remove_scalar(COO* in, COO* out, T scalar, cudaStream_t stream) +{ rmm::device_uvector row_count_nz(in->n_rows, stream); rmm::device_uvector row_count(in->n_rows, stream); - CUDA_CHECK( - cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream)); - CUDA_CHECK( - cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream)); linalg::coo_degree(in->rows(), in->nnz, row_count.data(), stream); CUDA_CHECK(cudaPeekAtLastError()); - linalg::coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, - row_count_nz.data(), stream); + linalg::coo_degree_scalar( + in->rows(), in->vals(), in->nnz, scalar, row_count_nz.data(), stream); CUDA_CHECK(cudaPeekAtLastError()); - thrust::device_ptr d_row_count_nz = - thrust::device_pointer_cast(row_count_nz.data()); - int out_nnz = thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, - d_row_count_nz + in->n_rows); + thrust::device_ptr d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data()); + int out_nnz = + thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows); out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream); - coo_remove_scalar(in->rows(), in->cols(), in->vals(), in->nnz, - out->rows(), out->cols(), out->vals(), - row_count_nz.data(), row_count.data(), scalar, - in->n_rows, stream); + coo_remove_scalar(in->rows(), + in->cols(), + in->vals(), + in->nnz, + out->rows(), + out->cols(), + out->vals(), + row_count_nz.data(), + row_count.data(), + scalar, + in->n_rows, + stream); CUDA_CHECK(cudaPeekAtLastError()); } @@ -161,7 +189,8 @@ void coo_remove_scalar(COO *in, COO *out, T scalar, cudaStream_t stream) { * @param stream: cuda stream to use */ template -void coo_remove_zeros(COO *in, COO *out, cudaStream_t stream) { +void coo_remove_zeros(COO* in, COO* out, cudaStream_t stream) +{ coo_remove_scalar(in, out, T(0.0), stream); } diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh index 09a35720fb..84d584d108 100644 --- a/cpp/include/raft/sparse/op/reduce.cuh +++ b/cpp/include/raft/sparse/op/reduce.cuh @@ -44,25 +44,29 @@ namespace sparse { namespace op { template -__global__ void compute_duplicates_diffs_kernel(const value_idx *rows, - const value_idx *cols, - value_idx *diff, size_t nnz) { +__global__ void compute_duplicates_diffs_kernel(const value_idx* rows, + const value_idx* cols, + value_idx* diff, + size_t nnz) +{ size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; value_idx d = 1; - if (tid == 0 || (rows[tid - 1] == rows[tid] && cols[tid - 1] == cols[tid])) - d = 0; + if (tid == 0 || (rows[tid - 1] == rows[tid] && cols[tid - 1] == cols[tid])) d = 0; diff[tid] = d; } template -__global__ void max_duplicates_kernel(const value_idx *src_rows, - const value_idx *src_cols, - const value_t *src_vals, - const value_idx *index, - value_idx *out_rows, value_idx *out_cols, - value_t *out_vals, size_t nnz) { +__global__ void max_duplicates_kernel(const value_idx* src_rows, + const value_idx* src_cols, + const value_t* src_vals, + const value_idx* index, + value_idx* out_rows, + value_idx* out_cols, + value_t* out_vals, + size_t nnz) +{ size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid < nnz) { @@ -94,13 +98,13 @@ __global__ void max_duplicates_kernel(const value_idx *src_rows, * @param[in] stream cuda ops will be ordered wrt this stream */ template -void compute_duplicates_mask(value_idx *mask, const value_idx *rows, - const value_idx *cols, size_t nnz, - cudaStream_t stream) { +void compute_duplicates_mask( + value_idx* mask, const value_idx* rows, const value_idx* cols, size_t nnz, cudaStream_t stream) +{ CUDA_CHECK(cudaMemsetAsync(mask, 0, nnz * sizeof(value_idx), stream)); - compute_duplicates_diffs_kernel<<>>(rows, cols, mask, nnz); + compute_duplicates_diffs_kernel<<>>( + rows, cols, mask, nnz); } /** @@ -120,11 +124,16 @@ void compute_duplicates_mask(value_idx *mask, const value_idx *rows, * @param[in] stream cuda ops will be ordered wrt this stream */ template -void max_duplicates(const raft::handle_t &handle, - raft::sparse::COO &out, - const value_idx *rows, const value_idx *cols, - const value_t *vals, size_t nnz, size_t m, size_t n) { - auto stream = handle.get_stream(); +void max_duplicates(const raft::handle_t& handle, + raft::sparse::COO& out, + const value_idx* rows, + const value_idx* cols, + const value_t* vals, + size_t nnz, + size_t m, + size_t n) +{ + auto stream = handle.get_stream(); auto thrust_policy = handle.get_thrust_policy(); // compute diffs & take exclusive scan @@ -132,8 +141,7 @@ void max_duplicates(const raft::handle_t &handle, compute_duplicates_mask(diff.data(), rows, cols, nnz, stream); - thrust::exclusive_scan(thrust_policy, diff.data(), diff.data() + diff.size(), - diff.data()); + thrust::exclusive_scan(thrust_policy, diff.data(), diff.data() + diff.size(), diff.data()); // compute final size value_idx size = 0; diff --git a/cpp/include/raft/sparse/op/row_op.cuh b/cpp/include/raft/sparse/op/row_op.cuh index 9e5034dc28..194a878ac1 100644 --- a/cpp/include/raft/sparse/op/row_op.cuh +++ b/cpp/include/raft/sparse/op/row_op.cuh @@ -38,12 +38,12 @@ namespace sparse { namespace op { template void> -__global__ void csr_row_op_kernel(const T *row_ind, T n_rows, T nnz, - Lambda op) { +__global__ void csr_row_op_kernel(const T* row_ind, T n_rows, T nnz, Lambda op) +{ T row = blockIdx.x * TPB_X + threadIdx.x; if (row < n_rows) { T start_idx = row_ind[row]; - T stop_idx = row < n_rows - 1 ? row_ind[row + 1] : nnz; + T stop_idx = row < n_rows - 1 ? row_ind[row + 1] : nnz; op(row, start_idx, stop_idx); } } @@ -59,14 +59,12 @@ __global__ void csr_row_op_kernel(const T *row_ind, T n_rows, T nnz, * @param op custom row operation functor accepting the row and beginning index. * @param stream cuda stream to use */ -template void> -void csr_row_op(const Index_ *row_ind, Index_ n_rows, Index_ nnz, Lambda op, - cudaStream_t stream) { +template void> +void csr_row_op(const Index_* row_ind, Index_ n_rows, Index_ nnz, Lambda op, cudaStream_t stream) +{ dim3 grid(raft::ceildiv(n_rows, Index_(TPB_X)), 1, 1); dim3 blk(TPB_X, 1, 1); - csr_row_op_kernel - <<>>(row_ind, n_rows, nnz, op); + csr_row_op_kernel<<>>(row_ind, n_rows, nnz, op); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/sparse/op/slice.h b/cpp/include/raft/sparse/op/slice.h index 46f4f41879..9bbe04cf34 100644 --- a/cpp/include/raft/sparse/op/slice.h +++ b/cpp/include/raft/sparse/op/slice.h @@ -50,10 +50,14 @@ namespace op { * @param[in] stream : cuda stream for ordering events */ template -void csr_row_slice_indptr(value_idx start_row, value_idx stop_row, - const value_idx *indptr, value_idx *indptr_out, - value_idx *start_offset, value_idx *stop_offset, - cudaStream_t stream) { +void csr_row_slice_indptr(value_idx start_row, + value_idx stop_row, + const value_idx* indptr, + value_idx* indptr_out, + value_idx* start_offset, + value_idx* stop_offset, + cudaStream_t stream) +{ raft::update_host(start_offset, indptr + start_row, 1, stream); raft::update_host(stop_offset, indptr + stop_row + 1, 1, stream); @@ -63,11 +67,12 @@ void csr_row_slice_indptr(value_idx start_row, value_idx stop_row, // 0-based indexing so we need to add 1 to stop row. Because we want n_rows+1, // we add another 1 to stop row. - raft::copy_async(indptr_out, indptr + start_row, (stop_row + 2) - start_row, - stream); + raft::copy_async(indptr_out, indptr + start_row, (stop_row + 2) - start_row, stream); raft::linalg::unaryOp( - indptr_out, indptr_out, (stop_row + 2) - start_row, + indptr_out, + indptr_out, + (stop_row + 2) - start_row, [s_offset] __device__(value_idx input) { return input - s_offset; }, stream); } @@ -85,12 +90,15 @@ void csr_row_slice_indptr(value_idx start_row, value_idx stop_row, * @param[in] stream : cuda stream for ordering events */ template -void csr_row_slice_populate(value_idx start_offset, value_idx stop_offset, - const value_idx *indices, const value_t *data, - value_idx *indices_out, value_t *data_out, - cudaStream_t stream) { - raft::copy(indices_out, indices + start_offset, stop_offset - start_offset, - stream); +void csr_row_slice_populate(value_idx start_offset, + value_idx stop_offset, + const value_idx* indices, + const value_t* data, + value_idx* indices_out, + value_t* data_out, + cudaStream_t stream) +{ + raft::copy(indices_out, indices + start_offset, stop_offset - start_offset, stream); raft::copy(data_out, data + start_offset, stop_offset - start_offset, stream); } diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h index c40801a0b1..d397bce780 100644 --- a/cpp/include/raft/sparse/op/sort.h +++ b/cpp/include/raft/sparse/op/sort.h @@ -38,7 +38,8 @@ namespace op { struct TupleComp { template - __host__ __device__ bool operator()(const one &t1, const two &t2) { + __host__ __device__ bool operator()(const one& t1, const two& t2) + { // sort first by each sample's color, if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false; @@ -61,13 +62,12 @@ struct TupleComp { * @param stream: cuda stream to use */ template -void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, - cudaStream_t stream) { +void coo_sort(int m, int n, int nnz, int* rows, int* cols, T* vals, cudaStream_t stream) +{ auto coo_indices = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); // get all the colors in contiguous locations so we can map them to warps. - thrust::sort_by_key(rmm::exec_policy(stream), coo_indices, coo_indices + nnz, - vals, TupleComp()); + thrust::sort_by_key(rmm::exec_policy(stream), coo_indices, coo_indices + nnz, vals, TupleComp()); } /** @@ -77,9 +77,9 @@ void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, * @param stream: the cuda stream to use */ template -void coo_sort(COO *const in, cudaStream_t stream) { - coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), - in->vals(), stream); +void coo_sort(COO* const in, cudaStream_t stream) +{ + coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), stream); } /** @@ -93,8 +93,9 @@ void coo_sort(COO *const in, cudaStream_t stream) { * @param[in] stream cuda stream for which to order cuda operations */ template -void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data, - value_idx nnz, cudaStream_t stream) { +void coo_sort_by_weight( + value_idx* rows, value_idx* cols, value_t* data, value_idx nnz, cudaStream_t stream) +{ thrust::device_ptr t_data = thrust::device_pointer_cast(data); auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh index 5313b81192..8edb0e8b43 100644 --- a/cpp/include/raft/sparse/selection/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/connect_components.cuh @@ -59,17 +59,20 @@ struct KeyValuePair { __host__ __device__ __forceinline__ KeyValuePair() {} /// Copy Constructor - __host__ __device__ __forceinline__ - KeyValuePair(cub::KeyValuePair<_Key, _Value> kvp) - : key(kvp.key), value(kvp.value) {} + __host__ __device__ __forceinline__ KeyValuePair(cub::KeyValuePair<_Key, _Value> kvp) + : key(kvp.key), value(kvp.value) + { + } /// Constructor - __host__ __device__ __forceinline__ KeyValuePair(Key const &key, - Value const &value) - : key(key), value(value) {} + __host__ __device__ __forceinline__ KeyValuePair(Key const& key, Value const& value) + : key(key), value(value) + { + } /// Inequality operator - __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair &b) { + __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair& b) + { return (value != b.value) || (key != b.key); } }; @@ -83,31 +86,32 @@ struct KeyValuePair { */ template struct FixConnectivitiesRedOp { - value_idx *colors; + value_idx* colors; value_idx m; - FixConnectivitiesRedOp(value_idx *colors_, value_idx m_) - : colors(colors_), m(m_){}; + FixConnectivitiesRedOp(value_idx* colors_, value_idx m_) : colors(colors_), m(m_){}; typedef typename cub::KeyValuePair KVP; - DI void operator()(value_idx rit, KVP *out, const KVP &other) { - if (rit < m && other.value < out->value && - colors[rit] != colors[other.key]) { - out->key = other.key; + DI void operator()(value_idx rit, KVP* out, const KVP& other) + { + if (rit < m && other.value < out->value && colors[rit] != colors[other.key]) { + out->key = other.key; out->value = other.value; } } - DI KVP operator()(value_idx rit, const KVP &a, const KVP &b) { + DI KVP operator()(value_idx rit, const KVP& a, const KVP& b) + { if (rit < m && a.value < b.value && colors[rit] != colors[a.key]) { return a; } else return b; } - DI void init(value_t *out, value_t maxVal) { *out = maxVal; } - DI void init(KVP *out, value_t maxVal) { - out->key = -1; + DI void init(value_t* out, value_t maxVal) { *out = maxVal; } + DI void init(KVP* out, value_t maxVal) + { + out->key = -1; out->value = maxVal; } }; @@ -119,7 +123,8 @@ struct FixConnectivitiesRedOp { */ struct TupleComp { template - __host__ __device__ bool operator()(const one &t1, const two &t2) { + __host__ __device__ bool operator()(const one& t1, const two& t2) + { // sort first by each sample's color, if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false; @@ -137,13 +142,9 @@ template struct CubKVPMinReduce { typedef cub::KeyValuePair KVP; - DI KVP operator()(LabelT rit, const KVP &a, const KVP &b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } - DI KVP operator()(const KVP &a, const KVP &b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } }; // KVPMinReduce @@ -158,11 +159,10 @@ struct CubKVPMinReduce { * @return total number of components */ template -value_idx get_n_components(value_idx *colors, size_t n_rows, - cudaStream_t stream) { +value_idx get_n_components(value_idx* colors, size_t n_rows, cudaStream_t stream) +{ rmm::device_uvector map_ids(0, stream); - int num_clusters = - raft::label::getUniquelabels(map_ids, colors, n_rows, stream); + int num_clusters = raft::label::getUniquelabels(map_ids, colors, n_rows, stream); return num_clusters; } @@ -173,11 +173,12 @@ value_idx get_n_components(value_idx *colors, size_t n_rows, */ template struct LookupColorOp { - value_idx *colors; + value_idx* colors; - LookupColorOp(value_idx *colors_) : colors(colors_) {} + LookupColorOp(value_idx* colors_) : colors(colors_) {} - DI value_idx operator()(const cub::KeyValuePair &kvp) { + DI value_idx operator()(const cub::KeyValuePair& kvp) + { return colors[kvp.key]; } }; @@ -187,7 +188,8 @@ struct LookupColorOp { * the given array of components * @tparam value_idx * @tparam value_t - * @param[out] kvp mapping of closest neighbor vertex and distance for each vertex in the given array of components + * @param[out] kvp mapping of closest neighbor vertex and distance for each vertex in the given + * array of components * @param[out] nn_colors components of nearest neighbors for each vertex * @param[in] colors components of each vertex * @param[in] X original dense data @@ -196,24 +198,38 @@ struct LookupColorOp { * @param[in] stream cuda stream for which to order cuda operations */ template -void perform_1nn(cub::KeyValuePair *kvp, - value_idx *nn_colors, value_idx *colors, const value_t *X, - size_t n_rows, size_t n_cols, cudaStream_t stream, - red_op reduction_op) { +void perform_1nn(cub::KeyValuePair* kvp, + value_idx* nn_colors, + value_idx* colors, + const value_t* X, + size_t n_rows, + size_t n_cols, + cudaStream_t stream, + red_op reduction_op) +{ rmm::device_uvector workspace(n_rows, stream); rmm::device_uvector x_norm(n_rows, stream); - raft::linalg::rowNorm(x_norm.data(), X, n_cols, n_rows, raft::linalg::L2Norm, - true, stream); - - raft::distance::fusedL2NN, - value_idx>( - kvp, X, X, x_norm.data(), x_norm.data(), n_rows, n_rows, n_cols, - workspace.data(), reduction_op, reduction_op, true, true, stream); + raft::linalg::rowNorm(x_norm.data(), X, n_cols, n_rows, raft::linalg::L2Norm, true, stream); + + raft::distance::fusedL2NN, value_idx>( + kvp, + X, + X, + x_norm.data(), + x_norm.data(), + n_rows, + n_rows, + n_cols, + workspace.data(), + reduction_op, + reduction_op, + true, + true, + stream); LookupColorOp extract_colors_op(colors); - thrust::transform(rmm::exec_policy(stream), kvp, kvp + n_rows, nn_colors, - extract_colors_op); + thrust::transform(rmm::exec_policy(stream), kvp, kvp + n_rows, nn_colors, extract_colors_op); } /** @@ -229,27 +245,33 @@ void perform_1nn(cub::KeyValuePair *kvp, * @param stream stream for which to order CUDA operations */ template -void sort_by_color(value_idx *colors, value_idx *nn_colors, - cub::KeyValuePair *kvp, - value_idx *src_indices, size_t n_rows, cudaStream_t stream) { +void sort_by_color(value_idx* colors, + value_idx* nn_colors, + cub::KeyValuePair* kvp, + value_idx* src_indices, + size_t n_rows, + cudaStream_t stream) +{ thrust::counting_iterator arg_sort_iter(0); - thrust::copy(rmm::exec_policy(stream), arg_sort_iter, arg_sort_iter + n_rows, - src_indices); + thrust::copy(rmm::exec_policy(stream), arg_sort_iter, arg_sort_iter + n_rows, src_indices); - auto keys = thrust::make_zip_iterator(thrust::make_tuple( - colors, nn_colors, (raft::linkage::KeyValuePair *)kvp)); + auto keys = thrust::make_zip_iterator( + thrust::make_tuple(colors, nn_colors, (raft::linkage::KeyValuePair*)kvp)); auto vals = thrust::make_zip_iterator(thrust::make_tuple(src_indices)); // get all the colors in contiguous locations so we can map them to warps. - thrust::sort_by_key(rmm::exec_policy(stream), keys, keys + n_rows, vals, - TupleComp()); + thrust::sort_by_key(rmm::exec_policy(stream), keys, keys + n_rows, vals, TupleComp()); } template -__global__ void min_components_by_color_kernel( - value_idx *out_rows, value_idx *out_cols, value_t *out_vals, - const value_idx *out_index, const value_idx *indices, - const cub::KeyValuePair *kvp, size_t nnz) { +__global__ void min_components_by_color_kernel(value_idx* out_rows, + value_idx* out_cols, + value_t* out_vals, + const value_idx* out_index, + const value_idx* indices, + const cub::KeyValuePair* kvp, + size_t nnz) +{ size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; @@ -278,19 +300,20 @@ __global__ void min_components_by_color_kernel( * @param[in] stream cuda stream for which to order cuda operations */ template -void min_components_by_color(raft::sparse::COO &coo, - const value_idx *out_index, - const value_idx *indices, - const cub::KeyValuePair *kvp, - size_t nnz, cudaStream_t stream) { +void min_components_by_color(raft::sparse::COO& coo, + const value_idx* out_index, + const value_idx* indices, + const cub::KeyValuePair* kvp, + size_t nnz, + cudaStream_t stream) +{ /** * Arrays should be ordered by: colors_indptr->colors_n->kvp.value * so the last element of each column in the input CSR should be * the min. */ - min_components_by_color_kernel<<>>(coo.rows(), coo.cols(), coo.vals(), - out_index, indices, kvp, nnz); + min_components_by_color_kernel<<>>( + coo.rows(), coo.cols(), coo.vals(), out_index, indices, kvp, nnz); } /** @@ -312,12 +335,16 @@ void min_components_by_color(raft::sparse::COO &coo, * @param[in] n_cols number of cols in X */ template -void connect_components(const raft::handle_t &handle, - raft::sparse::COO &out, - const value_t *X, const value_idx *orig_colors, - size_t n_rows, size_t n_cols, red_op reduction_op, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2SqrtExpanded) { +void connect_components( + const raft::handle_t& handle, + raft::sparse::COO& out, + const value_t* X, + const value_idx* orig_colors, + size_t n_rows, + size_t n_cols, + red_op reduction_op, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) +{ auto stream = handle.get_stream(); RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded, @@ -328,8 +355,7 @@ void connect_components(const raft::handle_t &handle, raft::copy_async(colors.data(), orig_colors, n_rows, stream); // Normalize colors so they are drawn from a monotonically increasing set - raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, - true); + raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, true); value_idx n_components = get_n_components(colors.data(), n_rows, stream); @@ -338,36 +364,42 @@ void connect_components(const raft::handle_t &handle, * is guaranteed to be != color of its nearest neighbor. */ rmm::device_uvector nn_colors(n_rows, stream); - rmm::device_uvector> temp_inds_dists( - n_rows, stream); + rmm::device_uvector> temp_inds_dists(n_rows, stream); rmm::device_uvector src_indices(n_rows, stream); - perform_1nn(temp_inds_dists.data(), nn_colors.data(), colors.data(), X, - n_rows, n_cols, stream, reduction_op); + perform_1nn(temp_inds_dists.data(), + nn_colors.data(), + colors.data(), + X, + n_rows, + n_cols, + stream, + reduction_op); /** * Sort data points by color (neighbors are not sorted) */ // max_color + 1 = number of connected components // sort nn_colors by key w/ original colors - sort_by_color(colors.data(), nn_colors.data(), temp_inds_dists.data(), - src_indices.data(), n_rows, stream); + sort_by_color( + colors.data(), nn_colors.data(), temp_inds_dists.data(), src_indices.data(), n_rows, stream); /** * Take the min for any duplicate colors */ // Compute mask of duplicates rmm::device_uvector out_index(n_rows + 1, stream); - raft::sparse::op::compute_duplicates_mask(out_index.data(), colors.data(), - nn_colors.data(), n_rows, stream); + raft::sparse::op::compute_duplicates_mask( + out_index.data(), colors.data(), nn_colors.data(), n_rows, stream); - thrust::exclusive_scan(handle.get_thrust_policy(), out_index.data(), - out_index.data() + out_index.size(), out_index.data()); + thrust::exclusive_scan(handle.get_thrust_policy(), + out_index.data(), + out_index.data() + out_index.size(), + out_index.data()); // compute final size value_idx size = 0; - raft::update_host(&size, out_index.data() + (out_index.size() - 1), 1, - stream); + raft::update_host(&size, out_index.data() + (out_index.size() - 1), 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); size++; @@ -375,14 +407,14 @@ void connect_components(const raft::handle_t &handle, raft::sparse::COO min_edges(stream); min_edges.allocate(size, n_rows, n_rows, true, stream); - min_components_by_color(min_edges, out_index.data(), src_indices.data(), - temp_inds_dists.data(), n_rows, stream); + min_components_by_color( + min_edges, out_index.data(), src_indices.data(), temp_inds_dists.data(), n_rows, stream); /** * Symmetrize resulting edge list */ - raft::sparse::linalg::symmetrize(handle, min_edges.rows(), min_edges.cols(), - min_edges.vals(), n_rows, n_rows, size, out); + raft::sparse::linalg::symmetrize( + handle, min_edges.rows(), min_edges.cols(), min_edges.vals(), n_rows, n_rows, size, out); } }; // end namespace linkage diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh index b796b63dc8..8486abd863 100644 --- a/cpp/include/raft/sparse/selection/knn.cuh +++ b/cpp/include/raft/sparse/selection/knn.cuh @@ -38,9 +38,11 @@ namespace selection { template struct csr_batcher_t { - csr_batcher_t(value_idx batch_size, value_idx n_rows, - const value_idx *csr_indptr, const value_idx *csr_indices, - const value_t *csr_data) + csr_batcher_t(value_idx batch_size, + value_idx n_rows, + const value_idx* csr_indptr, + const value_idx* csr_indices, + const value_t* csr_data) : batch_start_(0), batch_stop_(0), batch_rows_(0), @@ -50,32 +52,42 @@ struct csr_batcher_t { csr_indices_(csr_indices), csr_data_(csr_data), batch_csr_start_offset_(0), - batch_csr_stop_offset_(0) {} + batch_csr_stop_offset_(0) + { + } - void set_batch(int batch_num) { + void set_batch(int batch_num) + { batch_start_ = batch_num * batch_size_; - batch_stop_ = batch_start_ + batch_size_ - 1; // zero-based indexing + batch_stop_ = batch_start_ + batch_size_ - 1; // zero-based indexing - if (batch_stop_ >= total_rows_) - batch_stop_ = total_rows_ - 1; // zero-based indexing + if (batch_stop_ >= total_rows_) batch_stop_ = total_rows_ - 1; // zero-based indexing batch_rows_ = (batch_stop_ - batch_start_) + 1; } - value_idx get_batch_csr_indptr_nnz(value_idx *batch_indptr, - cudaStream_t stream) { - raft::sparse::op::csr_row_slice_indptr( - batch_start_, batch_stop_, csr_indptr_, batch_indptr, - &batch_csr_start_offset_, &batch_csr_stop_offset_, stream); + value_idx get_batch_csr_indptr_nnz(value_idx* batch_indptr, cudaStream_t stream) + { + raft::sparse::op::csr_row_slice_indptr(batch_start_, + batch_stop_, + csr_indptr_, + batch_indptr, + &batch_csr_start_offset_, + &batch_csr_stop_offset_, + stream); return batch_csr_stop_offset_ - batch_csr_start_offset_; } - void get_batch_csr_indices_data(value_idx *csr_indices, value_t *csr_data, - cudaStream_t stream) { - raft::sparse::op::csr_row_slice_populate( - batch_csr_start_offset_, batch_csr_stop_offset_, csr_indices_, csr_data_, - csr_indices, csr_data, stream); + void get_batch_csr_indices_data(value_idx* csr_indices, value_t* csr_data, cudaStream_t stream) + { + raft::sparse::op::csr_row_slice_populate(batch_csr_start_offset_, + batch_csr_stop_offset_, + csr_indices_, + csr_data_, + csr_indices, + csr_data, + stream); } value_idx batch_rows() const { return batch_rows_; } @@ -92,9 +104,9 @@ struct csr_batcher_t { value_idx total_rows_; - const value_idx *csr_indptr_; - const value_idx *csr_indices_; - const value_t *csr_data_; + const value_idx* csr_indptr_; + const value_idx* csr_indices_; + const value_t* csr_data_; value_idx batch_csr_start_offset_; value_idx batch_csr_stop_offset_; @@ -103,18 +115,26 @@ struct csr_batcher_t { template class sparse_knn_t { public: - sparse_knn_t(const value_idx *idxIndptr_, const value_idx *idxIndices_, - const value_t *idxData_, size_t idxNNZ_, int n_idx_rows_, - int n_idx_cols_, const value_idx *queryIndptr_, - const value_idx *queryIndices_, const value_t *queryData_, - size_t queryNNZ_, int n_query_rows_, int n_query_cols_, - value_idx *output_indices_, value_t *output_dists_, int k_, - const raft::handle_t &handle_, - size_t batch_size_index_ = 2 << 14, // approx 1M - size_t batch_size_query_ = 2 << 14, - raft::distance::DistanceType metric_ = - raft::distance::DistanceType::L2Expanded, - float metricArg_ = 0) + sparse_knn_t(const value_idx* idxIndptr_, + const value_idx* idxIndices_, + const value_t* idxData_, + size_t idxNNZ_, + int n_idx_rows_, + int n_idx_cols_, + const value_idx* queryIndptr_, + const value_idx* queryIndices_, + const value_t* queryData_, + size_t queryNNZ_, + int n_query_rows_, + int n_query_cols_, + value_idx* output_indices_, + value_t* output_dists_, + int k_, + const raft::handle_t& handle_, + size_t batch_size_index_ = 2 << 14, // approx 1M + size_t batch_size_query_ = 2 << 14, + raft::distance::DistanceType metric_ = raft::distance::DistanceType::L2Expanded, + float metricArg_ = 0) : idxIndptr(idxIndptr_), idxIndices(idxIndices_), idxData(idxData_), @@ -134,9 +154,12 @@ class sparse_knn_t { batch_size_index(batch_size_index_), batch_size_query(batch_size_query_), metric(metric_), - metricArg(metricArg_) {} + metricArg(metricArg_) + { + } - void run() { + void run() + { using namespace raft::sparse; int n_batches_query = raft::ceildiv((size_t)n_query_rows, batch_size_query); @@ -147,37 +170,33 @@ class sparse_knn_t { for (int i = 0; i < n_batches_query; i++) { /** - * Compute index batch info - */ + * Compute index batch info + */ query_batcher.set_batch(i); /** - * Slice CSR to rows in batch - */ + * Slice CSR to rows in batch + */ - rmm::device_uvector query_batch_indptr( - query_batcher.batch_rows() + 1, handle.get_stream()); + rmm::device_uvector query_batch_indptr(query_batcher.batch_rows() + 1, + handle.get_stream()); - value_idx n_query_batch_nnz = query_batcher.get_batch_csr_indptr_nnz( - query_batch_indptr.data(), handle.get_stream()); + value_idx n_query_batch_nnz = + query_batcher.get_batch_csr_indptr_nnz(query_batch_indptr.data(), handle.get_stream()); - rmm::device_uvector query_batch_indices(n_query_batch_nnz, - handle.get_stream()); - rmm::device_uvector query_batch_data(n_query_batch_nnz, - handle.get_stream()); + rmm::device_uvector query_batch_indices(n_query_batch_nnz, handle.get_stream()); + rmm::device_uvector query_batch_data(n_query_batch_nnz, handle.get_stream()); - query_batcher.get_batch_csr_indices_data(query_batch_indices.data(), - query_batch_data.data(), - handle.get_stream()); + query_batcher.get_batch_csr_indices_data( + query_batch_indices.data(), query_batch_data.data(), handle.get_stream()); // A 3-partition temporary merge space to scale the batching. 2 parts for subsequent // batches and 1 space for the results of the merge, which get copied back to the top - rmm::device_uvector merge_buffer_indices(0, - handle.get_stream()); + rmm::device_uvector merge_buffer_indices(0, handle.get_stream()); rmm::device_uvector merge_buffer_dists(0, handle.get_stream()); - value_t *dists_merge_buffer_ptr; - value_idx *indices_merge_buffer_ptr; + value_t* dists_merge_buffer_ptr; + value_idx* indices_merge_buffer_ptr; int n_batches_idx = raft::ceildiv((size_t)n_idx_rows, batch_size_index); csr_batcher_t idx_batcher( @@ -186,22 +205,19 @@ class sparse_knn_t { for (int j = 0; j < n_batches_idx; j++) { idx_batcher.set_batch(j); - merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, - handle.get_stream()); - merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, - handle.get_stream()); + merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, handle.get_stream()); + merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, handle.get_stream()); /** - * Slice CSR to rows in batch - */ - rmm::device_uvector idx_batch_indptr( - idx_batcher.batch_rows() + 1, handle.get_stream()); - rmm::device_uvector idx_batch_indices(0, - handle.get_stream()); + * Slice CSR to rows in batch + */ + rmm::device_uvector idx_batch_indptr(idx_batcher.batch_rows() + 1, + handle.get_stream()); + rmm::device_uvector idx_batch_indices(0, handle.get_stream()); rmm::device_uvector idx_batch_data(0, handle.get_stream()); - value_idx idx_batch_nnz = idx_batcher.get_batch_csr_indptr_nnz( - idx_batch_indptr.data(), handle.get_stream()); + value_idx idx_batch_nnz = + idx_batcher.get_batch_csr_indptr_nnz(idx_batch_indptr.data(), handle.get_stream()); idx_batch_indices.resize(idx_batch_nnz, handle.get_stream()); idx_batch_data.resize(idx_batch_nnz, handle.get_stream()); @@ -210,111 +226,126 @@ class sparse_knn_t { idx_batch_indices.data(), idx_batch_data.data(), handle.get_stream()); /** - * Compute distances - */ - size_t dense_size = - idx_batcher.batch_rows() * query_batcher.batch_rows(); - rmm::device_uvector batch_dists(dense_size, - handle.get_stream()); - - CUDA_CHECK(cudaMemset(batch_dists.data(), 0, - batch_dists.size() * sizeof(value_t))); - - compute_distances(idx_batcher, query_batcher, idx_batch_nnz, - n_query_batch_nnz, idx_batch_indptr.data(), - idx_batch_indices.data(), idx_batch_data.data(), - query_batch_indptr.data(), query_batch_indices.data(), - query_batch_data.data(), batch_dists.data()); + * Compute distances + */ + size_t dense_size = idx_batcher.batch_rows() * query_batcher.batch_rows(); + rmm::device_uvector batch_dists(dense_size, handle.get_stream()); + + CUDA_CHECK(cudaMemset(batch_dists.data(), 0, batch_dists.size() * sizeof(value_t))); + + compute_distances(idx_batcher, + query_batcher, + idx_batch_nnz, + n_query_batch_nnz, + idx_batch_indptr.data(), + idx_batch_indices.data(), + idx_batch_data.data(), + query_batch_indptr.data(), + query_batch_indices.data(), + query_batch_data.data(), + batch_dists.data()); // Build batch indices array - rmm::device_uvector batch_indices(batch_dists.size(), - handle.get_stream()); + rmm::device_uvector batch_indices(batch_dists.size(), handle.get_stream()); // populate batch indices array - value_idx batch_rows = query_batcher.batch_rows(), - batch_cols = idx_batcher.batch_rows(); + value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows(); - iota_fill(batch_indices.data(), batch_rows, batch_cols, - handle.get_stream()); + iota_fill(batch_indices.data(), batch_rows, batch_cols, handle.get_stream()); /** * Perform k-selection on batch & merge with other k-selections */ size_t merge_buffer_offset = batch_rows * k; - dists_merge_buffer_ptr = - merge_buffer_dists.data() + merge_buffer_offset; - indices_merge_buffer_ptr = - merge_buffer_indices.data() + merge_buffer_offset; - - perform_k_selection(idx_batcher, query_batcher, batch_dists.data(), - batch_indices.data(), dists_merge_buffer_ptr, + dists_merge_buffer_ptr = merge_buffer_dists.data() + merge_buffer_offset; + indices_merge_buffer_ptr = merge_buffer_indices.data() + merge_buffer_offset; + + perform_k_selection(idx_batcher, + query_batcher, + batch_dists.data(), + batch_indices.data(), + dists_merge_buffer_ptr, indices_merge_buffer_ptr); - value_t *dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr; - value_idx *indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr; + value_t* dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr; + value_idx* indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr; // Merge results of difference batches if necessary if (idx_batcher.batch_start() > 0) { - size_t merge_buffer_tmp_out = batch_rows * k * 2; - dists_merge_buffer_tmp_ptr = - merge_buffer_dists.data() + merge_buffer_tmp_out; - indices_merge_buffer_tmp_ptr = - merge_buffer_indices.data() + merge_buffer_tmp_out; - - merge_batches(idx_batcher, query_batcher, merge_buffer_dists.data(), - merge_buffer_indices.data(), dists_merge_buffer_tmp_ptr, + size_t merge_buffer_tmp_out = batch_rows * k * 2; + dists_merge_buffer_tmp_ptr = merge_buffer_dists.data() + merge_buffer_tmp_out; + indices_merge_buffer_tmp_ptr = merge_buffer_indices.data() + merge_buffer_tmp_out; + + merge_batches(idx_batcher, + query_batcher, + merge_buffer_dists.data(), + merge_buffer_indices.data(), + dists_merge_buffer_tmp_ptr, indices_merge_buffer_tmp_ptr); } // copy merged output back into merge buffer partition for next iteration raft::copy_async(merge_buffer_indices.data(), indices_merge_buffer_tmp_ptr, - batch_rows * k, handle.get_stream()); + batch_rows * k, + handle.get_stream()); raft::copy_async(merge_buffer_dists.data(), - dists_merge_buffer_tmp_ptr, batch_rows * k, + dists_merge_buffer_tmp_ptr, + batch_rows * k, handle.get_stream()); } // Copy final merged batch to output array - raft::copy_async( - output_indices + (rows_processed * k), merge_buffer_indices.data(), - query_batcher.batch_rows() * k, handle.get_stream()); - raft::copy_async( - output_dists + (rows_processed * k), merge_buffer_dists.data(), - query_batcher.batch_rows() * k, handle.get_stream()); + raft::copy_async(output_indices + (rows_processed * k), + merge_buffer_indices.data(), + query_batcher.batch_rows() * k, + handle.get_stream()); + raft::copy_async(output_dists + (rows_processed * k), + merge_buffer_dists.data(), + query_batcher.batch_rows() * k, + handle.get_stream()); rows_processed += query_batcher.batch_rows(); } } private: - void merge_batches(csr_batcher_t &idx_batcher, - csr_batcher_t &query_batcher, - value_t *merge_buffer_dists, - value_idx *merge_buffer_indices, value_t *out_dists, - value_idx *out_indices) { + void merge_batches(csr_batcher_t& idx_batcher, + csr_batcher_t& query_batcher, + value_t* merge_buffer_dists, + value_idx* merge_buffer_indices, + value_t* out_dists, + value_idx* out_indices) + { // build translation buffer to shift resulting indices by the batch std::vector id_ranges; id_ranges.push_back(0); id_ranges.push_back(idx_batcher.batch_start()); rmm::device_uvector trans(id_ranges.size(), handle.get_stream()); - raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(), - handle.get_stream()); + raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(), handle.get_stream()); // combine merge buffers only if there's more than 1 partition to combine - raft::spatial::knn::knn_merge_parts( - merge_buffer_dists, merge_buffer_indices, out_dists, out_indices, - query_batcher.batch_rows(), 2, k, handle.get_stream(), trans.data()); + raft::spatial::knn::knn_merge_parts(merge_buffer_dists, + merge_buffer_indices, + out_dists, + out_indices, + query_batcher.batch_rows(), + 2, + k, + handle.get_stream(), + trans.data()); } void perform_k_selection(csr_batcher_t idx_batcher, csr_batcher_t query_batcher, - value_t *batch_dists, value_idx *batch_indices, - value_t *out_dists, value_idx *out_indices) { + value_t* batch_dists, + value_idx* batch_indices, + value_t* out_dists, + value_idx* out_indices) + { // populate batch indices array - value_idx batch_rows = query_batcher.batch_rows(), - batch_cols = idx_batcher.batch_rows(); + value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows(); // build translation buffer to shift resulting indices by the batch std::vector id_ranges; @@ -329,52 +360,60 @@ class sparse_knn_t { if (metric == raft::distance::DistanceType::InnerProduct) ascending = false; // kernel to slice first (min) k cols and copy into batched merge buffer - raft::spatial::knn::select_k(batch_dists, batch_indices, batch_rows, - batch_cols, out_dists, out_indices, ascending, - n_neighbors, handle.get_stream()); + raft::spatial::knn::select_k(batch_dists, + batch_indices, + batch_rows, + batch_cols, + out_dists, + out_indices, + ascending, + n_neighbors, + handle.get_stream()); } - void compute_distances(csr_batcher_t &idx_batcher, - csr_batcher_t &query_batcher, - size_t idx_batch_nnz, size_t query_batch_nnz, - value_idx *idx_batch_indptr, - value_idx *idx_batch_indices, value_t *idx_batch_data, - value_idx *query_batch_indptr, - value_idx *query_batch_indices, - value_t *query_batch_data, value_t *batch_dists) { + void compute_distances(csr_batcher_t& idx_batcher, + csr_batcher_t& query_batcher, + size_t idx_batch_nnz, + size_t query_batch_nnz, + value_idx* idx_batch_indptr, + value_idx* idx_batch_indices, + value_t* idx_batch_data, + value_idx* query_batch_indptr, + value_idx* query_batch_indices, + value_t* query_batch_data, + value_t* batch_dists) + { /** * Compute distances */ - raft::sparse::distance::distances_config_t dist_config( - handle); + raft::sparse::distance::distances_config_t dist_config(handle); dist_config.b_nrows = idx_batcher.batch_rows(); dist_config.b_ncols = n_idx_cols; - dist_config.b_nnz = idx_batch_nnz; + dist_config.b_nnz = idx_batch_nnz; - dist_config.b_indptr = idx_batch_indptr; + dist_config.b_indptr = idx_batch_indptr; dist_config.b_indices = idx_batch_indices; - dist_config.b_data = idx_batch_data; + dist_config.b_data = idx_batch_data; dist_config.a_nrows = query_batcher.batch_rows(); dist_config.a_ncols = n_query_cols; - dist_config.a_nnz = query_batch_nnz; + dist_config.a_nnz = query_batch_nnz; - dist_config.a_indptr = query_batch_indptr; + dist_config.a_indptr = query_batch_indptr; dist_config.a_indices = query_batch_indices; - dist_config.a_data = query_batch_data; + dist_config.a_data = query_batch_data; if (raft::sparse::distance::supportedDistance.find(metric) == raft::sparse::distance::supportedDistance.end()) THROW("DistanceType not supported: %d", metric); - raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric, - metricArg); + raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric, metricArg); } const value_idx *idxIndptr, *idxIndices, *queryIndptr, *queryIndices; - value_idx *output_indices; + value_idx* output_indices; const value_t *idxData, *queryData; - value_t *output_dists; + value_t* output_dists; size_t idxNNZ, queryNNZ, batch_size_index, batch_size_query; @@ -384,50 +423,74 @@ class sparse_knn_t { int n_idx_rows, n_idx_cols, n_query_rows, n_query_cols, k; - const raft::handle_t &handle; + const raft::handle_t& handle; }; /** - * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors - * using some distance implementation - * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1) - * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz) - * @param[in] idxData csr data array of the index matrix (size idxNNZ) - * @param[in] idxNNA number of non-zeros for sparse index matrix - * @param[in] n_idx_rows number of data samples in index matrix - * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1) - * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ) - * @param[in] queryData csr data array of the query matrix (size queryNNZ) - * @param[in] queryNNZ number of non-zeros for sparse query matrix - * @param[in] n_query_rows number of data samples in query matrix - * @param[in] n_query_cols number of features in query matrix - * @param[out] output_indices dense matrix for output indices (size n_query_rows * k) - * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) - * @param[in] k the number of neighbors to query - * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to - * @param[in] batch_size_index maximum number of rows to use from index matrix per batch - * @param[in] batch_size_query maximum number of rows to use from query matrix per batch - * @param[in] metric distance metric/measure to use - * @param[in] metricArg potential argument for metric (currently unused) - */ + * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors + * using some distance implementation + * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1) + * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz) + * @param[in] idxData csr data array of the index matrix (size idxNNZ) + * @param[in] idxNNA number of non-zeros for sparse index matrix + * @param[in] n_idx_rows number of data samples in index matrix + * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1) + * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ) + * @param[in] queryData csr data array of the query matrix (size queryNNZ) + * @param[in] queryNNZ number of non-zeros for sparse query matrix + * @param[in] n_query_rows number of data samples in query matrix + * @param[in] n_query_cols number of features in query matrix + * @param[out] output_indices dense matrix for output indices (size n_query_rows * k) + * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) + * @param[in] k the number of neighbors to query + * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to + * @param[in] batch_size_index maximum number of rows to use from index matrix per batch + * @param[in] batch_size_query maximum number of rows to use from query matrix per batch + * @param[in] metric distance metric/measure to use + * @param[in] metricArg potential argument for metric (currently unused) + */ template -void brute_force_knn(const value_idx *idxIndptr, const value_idx *idxIndices, - const value_t *idxData, size_t idxNNZ, int n_idx_rows, - int n_idx_cols, const value_idx *queryIndptr, - const value_idx *queryIndices, const value_t *queryData, - size_t queryNNZ, int n_query_rows, int n_query_cols, - value_idx *output_indices, value_t *output_dists, int k, - const raft::handle_t &handle, - size_t batch_size_index = 2 << 14, // approx 1M - size_t batch_size_query = 2 << 14, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2Expanded, - float metricArg = 0) { - sparse_knn_t( - idxIndptr, idxIndices, idxData, idxNNZ, n_idx_rows, n_idx_cols, queryIndptr, - queryIndices, queryData, queryNNZ, n_query_rows, n_query_cols, - output_indices, output_dists, k, handle, batch_size_index, batch_size_query, - metric, metricArg) +void brute_force_knn(const value_idx* idxIndptr, + const value_idx* idxIndices, + const value_t* idxData, + size_t idxNNZ, + int n_idx_rows, + int n_idx_cols, + const value_idx* queryIndptr, + const value_idx* queryIndices, + const value_t* queryData, + size_t queryNNZ, + int n_query_rows, + int n_query_cols, + value_idx* output_indices, + value_t* output_dists, + int k, + const raft::handle_t& handle, + size_t batch_size_index = 2 << 14, // approx 1M + size_t batch_size_query = 2 << 14, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded, + float metricArg = 0) +{ + sparse_knn_t(idxIndptr, + idxIndices, + idxData, + idxNNZ, + n_idx_rows, + n_idx_cols, + queryIndptr, + queryIndices, + queryData, + queryNNZ, + n_query_rows, + n_query_cols, + output_indices, + output_dists, + k, + handle, + batch_size_index, + batch_size_query, + metric, + metricArg) .run(); } diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh index 3df1c77081..f13c43c306 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.cuh +++ b/cpp/include/raft/sparse/selection/knn_graph.cuh @@ -45,31 +45,34 @@ namespace selection { * @param m */ template -__global__ void fill_indices(value_idx *indices, size_t m, size_t nnz) { +__global__ void fill_indices(value_idx* indices, size_t m, size_t nnz) +{ value_idx tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid >= nnz) return; - value_idx v = tid / m; + value_idx v = tid / m; indices[tid] = v; } template -value_idx build_k(value_idx n_samples, int c) { +value_idx build_k(value_idx n_samples, int c) +{ // from "kNN-MST-Agglomerative: A fast & scalable graph-based data clustering // approach on GPU" - return min(n_samples, - max((value_idx)2, (value_idx)floor(log2(n_samples)) + c)); + return min(n_samples, max((value_idx)2, (value_idx)floor(log2(n_samples)) + c)); } template -__global__ void conv_indices_kernel(in_t *inds, out_t *out, size_t nnz) { +__global__ void conv_indices_kernel(in_t* inds, out_t* out, size_t nnz) +{ size_t tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; - out_t v = inds[tid]; + out_t v = inds[tid]; out[tid] = v; } template -void conv_indices(in_t *inds, out_t *out, size_t size, cudaStream_t stream) { +void conv_indices(in_t* inds, out_t* out, size_t size, cudaStream_t stream) +{ size_t blocks = ceildiv(size, (size_t)tpb); conv_indices_kernel<<>>(inds, out, size); } @@ -92,9 +95,14 @@ void conv_indices(in_t *inds, out_t *out, size_t size, cudaStream_t stream) { * @param c */ template -void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, +void knn_graph(const handle_t& handle, + const value_t* X, + size_t m, + size_t n, raft::distance::DistanceType metric, - raft::sparse::COO &out, int c = 15) { + raft::sparse::COO& out, + int c = 15) +{ int k = build_k(m, c); auto stream = handle.get_stream(); @@ -108,8 +116,8 @@ void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, size_t blocks = ceildiv(nnz, (size_t)256); fill_indices<<>>(rows.data(), k, nnz); - std::vector inputs; - inputs.push_back(const_cast(X)); + std::vector inputs; + inputs.push_back(const_cast(X)); std::vector sizes; sizes.push_back(m); @@ -119,15 +127,25 @@ void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, rmm::device_uvector int64_indices(nnz, stream); uint32_t knn_start = curTimeMillis(); - raft::spatial::knn::brute_force_knn( - handle, inputs, sizes, n, const_cast(X), m, int64_indices.data(), - data.data(), k, true, true, nullptr, metric); + raft::spatial::knn::brute_force_knn(handle, + inputs, + sizes, + n, + const_cast(X), + m, + int64_indices.data(), + data.data(), + k, + true, + true, + nullptr, + metric); // convert from current knn's 64-bit to 32-bit. conv_indices(int64_indices.data(), indices.data(), nnz, stream); - raft::sparse::linalg::symmetrize(handle, rows.data(), indices.data(), - data.data(), m, k, nnz, out); + raft::sparse::linalg::symmetrize( + handle, rows.data(), indices.data(), data.data(), m, k, nnz, out); } }; // namespace selection diff --git a/cpp/include/raft/sparse/utils.h b/cpp/include/raft/sparse/utils.h index 63578bf1f3..56e8832e0a 100644 --- a/cpp/include/raft/sparse/utils.h +++ b/cpp/include/raft/sparse/utils.h @@ -26,7 +26,8 @@ namespace sparse { * @param[in] ncols number of blocks to quantize */ template -inline int block_dim(value_idx ncols) { +inline int block_dim(value_idx ncols) +{ int blockdim; if (ncols <= 32) blockdim = 32; @@ -54,9 +55,9 @@ inline int block_dim(value_idx ncols) { * @return */ template -__device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, - G key) { - unsigned int mask = __ballot_sync(init_mask, true); +__device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, G key) +{ + unsigned int mask = __ballot_sync(init_mask, true); unsigned int peer_group = 0; bool is_peer; @@ -77,12 +78,14 @@ __device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, } #endif -__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) { +__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) +{ return __ffs(peer_group) - 1; } template -__global__ void iota_fill_block_kernel(value_idx *indices, value_idx ncols) { +__global__ void iota_fill_block_kernel(value_idx* indices, value_idx ncols) +{ int row = blockIdx.x; int tid = threadIdx.x; @@ -92,15 +95,16 @@ __global__ void iota_fill_block_kernel(value_idx *indices, value_idx ncols) { } template -void iota_fill(value_idx *indices, value_idx nrows, value_idx ncols, - cudaStream_t stream) { +void iota_fill(value_idx* indices, value_idx nrows, value_idx ncols, cudaStream_t stream) +{ int blockdim = block_dim(ncols); iota_fill_block_kernel<<>>(indices, ncols); } template -__device__ int get_stop_idx(T row, T m, T nnz, const T *ind) { +__device__ int get_stop_idx(T row, T m, T nnz, const T* ind) +{ int stop_idx = 0; if (row < (m - 1)) stop_idx = ind[row + 1]; diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp index 2cdf9bf4f5..e8cc85256d 100644 --- a/cpp/include/raft/spatial/knn/ann.hpp +++ b/cpp/include/raft/spatial/knn/ann.hpp @@ -42,14 +42,16 @@ namespace knn { * @param[in] D the dimensionality of the index array */ template -inline void approx_knn_build_index(raft::handle_t &handle, - raft::spatial::knn::knnIndex *index, - knnIndexParam *params, +inline void approx_knn_build_index(raft::handle_t& handle, + raft::spatial::knn::knnIndex* index, + knnIndexParam* params, raft::distance::DistanceType metric, - float metricArg, float *index_array, - value_idx n, value_idx D) { - detail::approx_knn_build_index(handle, index, params, metric, metricArg, - index_array, n, D); + float metricArg, + float* index_array, + value_idx n, + value_idx D) +{ + detail::approx_knn_build_index(handle, index, params, metric, metricArg, index_array, n, D); } /** @@ -66,12 +68,15 @@ inline void approx_knn_build_index(raft::handle_t &handle, * @param[in] n number of rows in the query array */ template -inline void approx_knn_search(raft::handle_t &handle, float *distances, - int64_t *indices, - raft::spatial::knn::knnIndex *index, value_idx k, - float *query_array, value_idx n) { - detail::approx_knn_search(handle, distances, indices, index, k, query_array, - n); +inline void approx_knn_search(raft::handle_t& handle, + float* distances, + int64_t* indices, + raft::spatial::knn::knnIndex* index, + value_idx k, + float* query_array, + value_idx n) +{ + detail::approx_knn_search(handle, distances, indices, index, k, query_array, n); } } // namespace knn diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h index 6a6c7751c2..573a23181d 100644 --- a/cpp/include/raft/spatial/knn/ann_common.h +++ b/cpp/include/raft/spatial/knn/ann_common.h @@ -26,13 +26,14 @@ namespace spatial { namespace knn { struct knnIndex { - faiss::gpu::GpuIndex *index; + faiss::gpu::GpuIndex* index; raft::distance::DistanceType metric; float metricArg; - faiss::gpu::StandardGpuResources *gpu_res; + faiss::gpu::StandardGpuResources* gpu_res; int device; - ~knnIndex() { + ~knnIndex() + { delete index; delete gpu_res; } @@ -57,7 +58,8 @@ struct IVFParam : knnIndexParam { int nprobe; }; -struct IVFFlatParam : IVFParam {}; +struct IVFFlatParam : IVFParam { +}; struct IVFPQParam : IVFParam { int M; diff --git a/cpp/include/raft/spatial/knn/ball_cover.hpp b/cpp/include/raft/spatial/knn/ball_cover.hpp index a98473f186..cb2b9e99cd 100644 --- a/cpp/include/raft/spatial/knn/ball_cover.hpp +++ b/cpp/include/raft/spatial/knn/ball_cover.hpp @@ -28,12 +28,11 @@ namespace raft { namespace spatial { namespace knn { -template -void rbc_build_index(const raft::handle_t &handle, - BallCoverIndex &index) { - ASSERT(index.n == 2, - "Random ball cover currently only works in 2-dimensions"); +template +void rbc_build_index(const raft::handle_t& handle, + BallCoverIndex& index) +{ + ASSERT(index.n == 2, "Random ball cover currently only works in 2-dimensions"); if (index.metric == raft::distance::DistanceType::Haversine) { detail::rbc_build_index(handle, index, detail::HaversineFunc()); } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded || @@ -74,23 +73,23 @@ void rbc_build_index(const raft::handle_t &handle, * many datasets can still have great recall even by only * looking in the closest landmark. */ -template -void rbc_all_knn_query(const raft::handle_t &handle, - BallCoverIndex &index, - value_int k, value_idx *inds, value_t *dists, - bool perform_post_filtering = true, float weight = 1.0) { - ASSERT(index.n == 2, - "Random ball cover currently only works in 2-dimensions"); +template +void rbc_all_knn_query(const raft::handle_t& handle, + BallCoverIndex& index, + value_int k, + value_idx* inds, + value_t* dists, + bool perform_post_filtering = true, + float weight = 1.0) +{ + ASSERT(index.n == 2, "Random ball cover currently only works in 2-dimensions"); if (index.metric == raft::distance::DistanceType::Haversine) { - detail::rbc_all_knn_query(handle, index, k, inds, dists, - detail::HaversineFunc(), perform_post_filtering, - weight); + detail::rbc_all_knn_query( + handle, index, k, inds, dists, detail::HaversineFunc(), perform_post_filtering, weight); } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded || index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) { - detail::rbc_all_knn_query(handle, index, k, inds, dists, - detail::EuclideanFunc(), perform_post_filtering, - weight); + detail::rbc_all_knn_query( + handle, index, k, inds, dists, detail::EuclideanFunc(), perform_post_filtering, weight); } else { RAFT_FAIL("Metric not supported"); } @@ -127,23 +126,40 @@ void rbc_all_knn_query(const raft::handle_t &handle, * looking in the closest landmark. * @param[in] n_query_pts number of query points */ -template -void rbc_knn_query(const raft::handle_t &handle, - BallCoverIndex &index, - value_int k, const value_t *query, value_int n_query_pts, - value_idx *inds, value_t *dists, - bool perform_post_filtering = true, float weight = 1.0) { - ASSERT(index.n == 2, - "Random ball cover currently only works in 2-dimensions"); +template +void rbc_knn_query(const raft::handle_t& handle, + BallCoverIndex& index, + value_int k, + const value_t* query, + value_int n_query_pts, + value_idx* inds, + value_t* dists, + bool perform_post_filtering = true, + float weight = 1.0) +{ + ASSERT(index.n == 2, "Random ball cover currently only works in 2-dimensions"); if (index.metric == raft::distance::DistanceType::Haversine) { - detail::rbc_knn_query(handle, index, k, query, n_query_pts, inds, dists, - detail::HaversineFunc(), perform_post_filtering, + detail::rbc_knn_query(handle, + index, + k, + query, + n_query_pts, + inds, + dists, + detail::HaversineFunc(), + perform_post_filtering, weight); } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded || index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) { - detail::rbc_knn_query(handle, index, k, query, n_query_pts, inds, dists, - detail::EuclideanFunc(), perform_post_filtering, + detail::rbc_knn_query(handle, + index, + k, + query, + n_query_pts, + inds, + dists, + detail::EuclideanFunc(), + perform_post_filtering, weight); } else { RAFT_FAIL("Metric not supported"); diff --git a/cpp/include/raft/spatial/knn/ball_cover_common.h b/cpp/include/raft/spatial/knn/ball_cover_common.h index ca614bb0cb..e38124edb6 100644 --- a/cpp/include/raft/spatial/knn/ball_cover_common.h +++ b/cpp/include/raft/spatial/knn/ball_cover_common.h @@ -34,12 +34,13 @@ namespace knn { * @tparam value_t * @tparam value_int */ -template +template class BallCoverIndex { public: - explicit BallCoverIndex(const raft::handle_t &handle_, const value_t *X_, - value_int m_, value_int n_, + explicit BallCoverIndex(const raft::handle_t& handle_, + const value_t* X_, + value_int m_, + value_int n_, raft::distance::DistanceType metric_) : handle(handle_), X(X_), @@ -47,37 +48,39 @@ class BallCoverIndex { n(n_), metric(metric_), /** - * the sqrt() here makes the sqrt(m)^2 a linear-time lower bound - * - * Total memory footprint of index: (2 * sqrt(m)) + (n * sqrt(m)) + (2 * m) - */ + * the sqrt() here makes the sqrt(m)^2 a linear-time lower bound + * + * Total memory footprint of index: (2 * sqrt(m)) + (n * sqrt(m)) + (2 * m) + */ n_landmarks(sqrt(m_)), R_indptr(sqrt(m_) + 1, handle.get_stream()), R_1nn_cols(m_, handle.get_stream()), R_1nn_dists(m_, handle.get_stream()), R(sqrt(m_) * n_, handle.get_stream()), R_radius(sqrt(m_), handle.get_stream()), - index_trained(false) {} + index_trained(false) + { + } - value_idx *get_R_indptr() { return R_indptr.data(); } - value_idx *get_R_1nn_cols() { return R_1nn_cols.data(); } - value_t *get_R_1nn_dists() { return R_1nn_dists.data(); } - value_t *get_R_radius() { return R_radius.data(); } - value_t *get_R() { return R.data(); } - const value_t *get_X() { return X; } + value_idx* get_R_indptr() { return R_indptr.data(); } + value_idx* get_R_1nn_cols() { return R_1nn_cols.data(); } + value_t* get_R_1nn_dists() { return R_1nn_dists.data(); } + value_t* get_R_radius() { return R_radius.data(); } + value_t* get_R() { return R.data(); } + const value_t* get_X() { return X; } bool is_index_trained() const { return index_trained; }; // This should only be set by internal functions void set_index_trained() { index_trained = true; } - const raft::handle_t &handle; + const raft::handle_t& handle; const value_int m; const value_int n; const value_int n_landmarks; - const value_t *X; + const value_t* X; raft::distance::DistanceType metric; diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 980001f166..7f4e4511d2 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -55,90 +55,84 @@ namespace spatial { namespace knn { namespace detail { -inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype( - QuantizerType qtype) { +inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(QuantizerType qtype) +{ switch (qtype) { - case QuantizerType::QT_8bit: - return faiss::ScalarQuantizer::QuantizerType::QT_8bit; + case QuantizerType::QT_8bit: return faiss::ScalarQuantizer::QuantizerType::QT_8bit; case QuantizerType::QT_8bit_uniform: return faiss::ScalarQuantizer::QuantizerType::QT_8bit_uniform; case QuantizerType::QT_4bit_uniform: return faiss::ScalarQuantizer::QuantizerType::QT_4bit_uniform; - case QuantizerType::QT_fp16: - return faiss::ScalarQuantizer::QuantizerType::QT_fp16; + case QuantizerType::QT_fp16: return faiss::ScalarQuantizer::QuantizerType::QT_fp16; case QuantizerType::QT_8bit_direct: return faiss::ScalarQuantizer::QuantizerType::QT_8bit_direct; - case QuantizerType::QT_6bit: - return faiss::ScalarQuantizer::QuantizerType::QT_6bit; - default: - return (faiss::ScalarQuantizer::QuantizerType)qtype; + case QuantizerType::QT_6bit: return faiss::ScalarQuantizer::QuantizerType::QT_6bit; + default: return (faiss::ScalarQuantizer::QuantizerType)qtype; } } template -void approx_knn_ivfflat_build_index(knnIndex *index, IVFParam *params, - raft::distance::DistanceType metric, - IntType n, IntType D) { +void approx_knn_ivfflat_build_index( + knnIndex* index, IVFParam* params, raft::distance::DistanceType metric, IntType n, IntType D) +{ faiss::gpu::GpuIndexIVFFlatConfig config; - config.device = index->device; + config.device = index->device; faiss::MetricType faiss_metric = build_faiss_metric(metric); - faiss::gpu::GpuIndexIVFFlat *faiss_index = new faiss::gpu::GpuIndexIVFFlat( - index->gpu_res, D, params->nlist, faiss_metric, config); + faiss::gpu::GpuIndexIVFFlat* faiss_index = + new faiss::gpu::GpuIndexIVFFlat(index->gpu_res, D, params->nlist, faiss_metric, config); faiss_index->setNumProbes(params->nprobe); index->index = faiss_index; } template -void approx_knn_ivfpq_build_index(knnIndex *index, IVFPQParam *params, - raft::distance::DistanceType metric, - IntType n, IntType D) { +void approx_knn_ivfpq_build_index( + knnIndex* index, IVFPQParam* params, raft::distance::DistanceType metric, IntType n, IntType D) +{ faiss::gpu::GpuIndexIVFPQConfig config; - config.device = index->device; - config.usePrecomputedTables = params->usePrecomputedTables; - config.interleavedLayout = params->n_bits != 8; - faiss::MetricType faiss_metric = build_faiss_metric(metric); - faiss::gpu::GpuIndexIVFPQ *faiss_index = - new faiss::gpu::GpuIndexIVFPQ(index->gpu_res, D, params->nlist, params->M, - params->n_bits, faiss_metric, config); + config.device = index->device; + config.usePrecomputedTables = params->usePrecomputedTables; + config.interleavedLayout = params->n_bits != 8; + faiss::MetricType faiss_metric = build_faiss_metric(metric); + faiss::gpu::GpuIndexIVFPQ* faiss_index = new faiss::gpu::GpuIndexIVFPQ( + index->gpu_res, D, params->nlist, params->M, params->n_bits, faiss_metric, config); faiss_index->setNumProbes(params->nprobe); index->index = faiss_index; } template -void approx_knn_ivfsq_build_index(knnIndex *index, IVFSQParam *params, - raft::distance::DistanceType metric, - IntType n, IntType D) { +void approx_knn_ivfsq_build_index( + knnIndex* index, IVFSQParam* params, raft::distance::DistanceType metric, IntType n, IntType D) +{ faiss::gpu::GpuIndexIVFScalarQuantizerConfig config; - config.device = index->device; - faiss::MetricType faiss_metric = build_faiss_metric(metric); - faiss::ScalarQuantizer::QuantizerType faiss_qtype = - build_faiss_qtype(params->qtype); - faiss::gpu::GpuIndexIVFScalarQuantizer *faiss_index = - new faiss::gpu::GpuIndexIVFScalarQuantizer(index->gpu_res, D, params->nlist, - faiss_qtype, faiss_metric, - params->encodeResidual); + config.device = index->device; + faiss::MetricType faiss_metric = build_faiss_metric(metric); + faiss::ScalarQuantizer::QuantizerType faiss_qtype = build_faiss_qtype(params->qtype); + faiss::gpu::GpuIndexIVFScalarQuantizer* faiss_index = new faiss::gpu::GpuIndexIVFScalarQuantizer( + index->gpu_res, D, params->nlist, faiss_qtype, faiss_metric, params->encodeResidual); faiss_index->setNumProbes(params->nprobe); index->index = faiss_index; } template -void approx_knn_build_index(raft::handle_t &handle, - raft::spatial::knn::knnIndex *index, - raft::spatial::knn::knnIndexParam *params, +void approx_knn_build_index(raft::handle_t& handle, + raft::spatial::knn::knnIndex* index, + raft::spatial::knn::knnIndexParam* params, raft::distance::DistanceType metric, - float metricArg, float *index_array, IntType n, - IntType D) { + float metricArg, + float* index_array, + IntType n, + IntType D) +{ int device; CUDA_CHECK(cudaGetDevice(&device)); - faiss::gpu::StandardGpuResources *gpu_res = - new faiss::gpu::StandardGpuResources(); + faiss::gpu::StandardGpuResources* gpu_res = new faiss::gpu::StandardGpuResources(); gpu_res->noTempMemory(); gpu_res->setDefaultStream(device, handle.get_stream()); - index->gpu_res = gpu_res; - index->device = device; - index->index = nullptr; - index->metric = metric; + index->gpu_res = gpu_res; + index->device = device; + index->index = nullptr; + index->metric = metric; index->metricArg = metricArg; // perform preprocessing @@ -148,21 +142,20 @@ void approx_knn_build_index(raft::handle_t &handle, query_metric_processor->preprocess(index_array); - if (dynamic_cast(params)) { - IVFFlatParam *IVFFlat_param = dynamic_cast(params); + if (dynamic_cast(params)) { + IVFFlatParam* IVFFlat_param = dynamic_cast(params); approx_knn_ivfflat_build_index(index, IVFFlat_param, metric, n, D); std::vector h_index_array(n * D); - raft::update_host(h_index_array.data(), index_array, h_index_array.size(), - handle.get_stream()); + raft::update_host(h_index_array.data(), index_array, h_index_array.size(), handle.get_stream()); query_metric_processor->revert(index_array); index->index->train(n, h_index_array.data()); index->index->add(n, h_index_array.data()); } else { - if (dynamic_cast(params)) { - IVFPQParam *IVFPQ_param = dynamic_cast(params); + if (dynamic_cast(params)) { + IVFPQParam* IVFPQ_param = dynamic_cast(params); approx_knn_ivfpq_build_index(index, IVFPQ_param, metric, n, D); - } else if (dynamic_cast(params)) { - IVFSQParam *IVFSQ_param = dynamic_cast(params); + } else if (dynamic_cast(params)) { + IVFSQParam* IVFSQ_param = dynamic_cast(params); approx_knn_ivfsq_build_index(index, IVFSQ_param, metric, n, D); } else { ASSERT(index->index, "KNN index could not be initialized"); @@ -175,13 +168,17 @@ void approx_knn_build_index(raft::handle_t &handle, } template -void approx_knn_search(raft::handle_t &handle, float *distances, - int64_t *indices, raft::spatial::knn::knnIndex *index, - IntType k, float *query_array, IntType n) { +void approx_knn_search(raft::handle_t& handle, + float* distances, + int64_t* indices, + raft::spatial::knn::knnIndex* index, + IntType k, + float* query_array, + IntType n) +{ // perform preprocessing std::unique_ptr> query_metric_processor = - create_processor(index->metric, n, index->index->d, k, false, - handle.get_stream()); + create_processor(index->metric, n, index->index->d, k, false, handle.get_stream()); query_metric_processor->preprocess(query_array); index->index->search(n, query_array, k, distances, indices); @@ -192,13 +189,14 @@ void approx_knn_search(raft::handle_t &handle, float *distances, index->metric == raft::distance::DistanceType::L2SqrtUnexpanded || index->metric == raft::distance::DistanceType::LpUnexpanded) { /** - * post-processing - */ + * post-processing + */ float p = 0.5; // standard l2 - if (index->metric == raft::distance::DistanceType::LpUnexpanded) - p = 1.0 / index->metricArg; + if (index->metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / index->metricArg; raft::linalg::unaryOp( - distances, distances, n * k, + distances, + distances, + n * k, [p] __device__(float input) { return powf(input, p); }, handle.get_stream()); } diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh index 7354fa3497..7b54c3d25b 100644 --- a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh +++ b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh @@ -60,34 +60,43 @@ namespace detail { * @param handle * @param index */ -template -void sample_landmarks(const raft::handle_t &handle, - BallCoverIndex &index) { - rmm::device_uvector R_1nn_cols2(index.n_landmarks, - handle.get_stream()); +template +void sample_landmarks(const raft::handle_t& handle, + BallCoverIndex& index) +{ + rmm::device_uvector R_1nn_cols2(index.n_landmarks, handle.get_stream()); rmm::device_uvector R_1nn_ones(index.m, handle.get_stream()); - rmm::device_uvector R_indices(index.n_landmarks, - handle.get_stream()); + rmm::device_uvector R_indices(index.n_landmarks, handle.get_stream()); - thrust::sequence(handle.get_thrust_policy(), index.get_R_1nn_cols(), - index.get_R_1nn_cols() + index.m, (value_idx)0); + thrust::sequence(handle.get_thrust_policy(), + index.get_R_1nn_cols(), + index.get_R_1nn_cols() + index.m, + (value_idx)0); - thrust::fill(handle.get_thrust_policy(), R_1nn_ones.data(), - R_1nn_ones.data() + R_1nn_ones.size(), 1.0); + thrust::fill( + handle.get_thrust_policy(), R_1nn_ones.data(), R_1nn_ones.data() + R_1nn_ones.size(), 1.0); /** - * 1. Randomly sample sqrt(n) points from X - */ + * 1. Randomly sample sqrt(n) points from X + */ auto rng = raft::random::Rng(12345); - rng.sampleWithoutReplacement(handle, R_indices.data(), R_1nn_cols2.data(), - index.get_R_1nn_cols(), R_1nn_ones.data(), - (value_idx)index.n_landmarks, (value_idx)index.m, + rng.sampleWithoutReplacement(handle, + R_indices.data(), + R_1nn_cols2.data(), + index.get_R_1nn_cols(), + R_1nn_ones.data(), + (value_idx)index.n_landmarks, + (value_idx)index.m, handle.get_stream()); - raft::matrix::copyRows( - index.get_X(), index.m, index.n, index.get_R(), R_1nn_cols2.data(), - index.n_landmarks, handle.get_stream(), true); + raft::matrix::copyRows(index.get_X(), + index.m, + index.n, + index.get_R(), + R_1nn_cols2.data(), + index.n_landmarks, + handle.get_stream(), + true); } /** @@ -100,35 +109,34 @@ void sample_landmarks(const raft::handle_t &handle, * @param k * @param index */ -template -void construct_landmark_1nn( - const raft::handle_t &handle, const value_idx *R_knn_inds_ptr, - const value_t *R_knn_dists_ptr, value_int k, - BallCoverIndex &index) { +template +void construct_landmark_1nn(const raft::handle_t& handle, + const value_idx* R_knn_inds_ptr, + const value_t* R_knn_dists_ptr, + value_int k, + BallCoverIndex& index) +{ rmm::device_uvector R_1nn_inds(index.m, handle.get_stream()); - value_idx *R_1nn_inds_ptr = R_1nn_inds.data(); - value_t *R_1nn_dists_ptr = index.get_R_1nn_dists(); + value_idx* R_1nn_inds_ptr = R_1nn_inds.data(); + value_t* R_1nn_dists_ptr = index.get_R_1nn_dists(); auto idxs = thrust::make_counting_iterator(0); - thrust::for_each(handle.get_thrust_policy(), idxs, idxs + index.m, - [=] __device__(value_idx i) { - R_1nn_inds_ptr[i] = R_knn_inds_ptr[i * k]; - R_1nn_dists_ptr[i] = R_knn_dists_ptr[i * k]; - }); + thrust::for_each(handle.get_thrust_policy(), idxs, idxs + index.m, [=] __device__(value_idx i) { + R_1nn_inds_ptr[i] = R_knn_inds_ptr[i * k]; + R_1nn_dists_ptr[i] = R_knn_dists_ptr[i * k]; + }); - auto keys = thrust::make_zip_iterator( - thrust::make_tuple(R_1nn_inds.data(), index.get_R_1nn_dists())); + auto keys = + thrust::make_zip_iterator(thrust::make_tuple(R_1nn_inds.data(), index.get_R_1nn_dists())); // group neighborhoods for each reference landmark and sort each group by distance - thrust::sort_by_key(handle.get_thrust_policy(), keys, keys + index.m, - index.get_R_1nn_cols(), NNComp()); + thrust::sort_by_key( + handle.get_thrust_policy(), keys, keys + index.m, index.get_R_1nn_cols(), NNComp()); // convert to CSR for fast lookup raft::sparse::convert::sorted_coo_to_csr( - R_1nn_inds.data(), index.m, index.get_R_indptr(), index.n_landmarks + 1, - handle.get_stream()); + R_1nn_inds.data(), index.m, index.get_R_indptr(), index.n_landmarks + 1, handle.get_stream()); } /** @@ -144,20 +152,33 @@ void construct_landmark_1nn( * @param R_knn_inds * @param R_knn_dists */ -template -void k_closest_landmarks(const raft::handle_t &handle, - BallCoverIndex &index, - const value_t *query_pts, value_int n_query_pts, - value_int k, value_idx *R_knn_inds, - value_t *R_knn_dists) { - std::vector input = {index.get_R()}; +template +void k_closest_landmarks(const raft::handle_t& handle, + BallCoverIndex& index, + const value_t* query_pts, + value_int n_query_pts, + value_int k, + value_idx* R_knn_inds, + value_t* R_knn_dists) +{ + std::vector input = {index.get_R()}; std::vector sizes = {index.n_landmarks}; - brute_force_knn_impl( - input, sizes, index.n, const_cast(query_pts), n_query_pts, - R_knn_inds, R_knn_dists, k, handle.get_stream(), nullptr, 0, true, true, - nullptr, index.metric); + brute_force_knn_impl(input, + sizes, + index.n, + const_cast(query_pts), + n_query_pts, + R_knn_inds, + R_knn_dists, + k, + handle.get_stream(), + nullptr, + 0, + true, + true, + nullptr, + index.metric); } /** @@ -168,21 +189,21 @@ void k_closest_landmarks(const raft::handle_t &handle, * @param handle * @param index */ -template -void compute_landmark_radii( - const raft::handle_t &handle, - BallCoverIndex &index) { +template +void compute_landmark_radii(const raft::handle_t& handle, + BallCoverIndex& index) +{ auto entries = thrust::make_counting_iterator(0); - const value_idx *R_indptr_ptr = index.get_R_indptr(); - const value_t *R_1nn_dists_ptr = index.get_R_1nn_dists(); - value_t *R_radius_ptr = index.get_R_radius(); - thrust::for_each(handle.get_thrust_policy(), entries, + const value_idx* R_indptr_ptr = index.get_R_indptr(); + const value_t* R_1nn_dists_ptr = index.get_R_1nn_dists(); + value_t* R_radius_ptr = index.get_R_radius(); + thrust::for_each(handle.get_thrust_policy(), + entries, entries + index.n_landmarks, [=] __device__(value_idx input) { value_idx last_row_idx = R_indptr_ptr[input + 1] - 1; - R_radius_ptr[input] = R_1nn_dists_ptr[last_row_idx]; + R_radius_ptr[input] = R_1nn_dists_ptr[last_row_idx]; }); } @@ -196,23 +217,51 @@ void compute_landmark_radii( * marking the distance to be computed between x, y only * if knn[k].distance >= d(x_i, R_k) + d(R_k, y) */ -template -void perform_rbc_query(const raft::handle_t &handle, - BallCoverIndex &index, - const value_t *query, value_int n_query_pts, - std::uint32_t k, const value_idx *R_knn_inds, - const value_t *R_knn_dists, dist_func dfunc, - value_idx *inds, value_t *dists, - value_int *dists_counter, value_int *post_dists_counter, - float weight = 1.0, bool perform_post_filtering = true) { +template +void perform_rbc_query(const raft::handle_t& handle, + BallCoverIndex& index, + const value_t* query, + value_int n_query_pts, + std::uint32_t k, + const value_idx* R_knn_inds, + const value_t* R_knn_dists, + dist_func dfunc, + value_idx* inds, + value_t* dists, + value_int* dists_counter, + value_int* post_dists_counter, + float weight = 1.0, + bool perform_post_filtering = true) +{ // Compute nearest k for each neighborhood in each closest R - rbc_low_dim_pass_one(handle, index, query, n_query_pts, k, R_knn_inds, - R_knn_dists, dfunc, inds, dists, weight, dists_counter); + rbc_low_dim_pass_one(handle, + index, + query, + n_query_pts, + k, + R_knn_inds, + R_knn_dists, + dfunc, + inds, + dists, + weight, + dists_counter); if (perform_post_filtering) { - rbc_low_dim_pass_two(handle, index, query, n_query_pts, k, R_knn_inds, - R_knn_dists, dfunc, inds, dists, weight, + rbc_low_dim_pass_two(handle, + index, + query, + n_query_pts, + k, + R_knn_inds, + R_knn_dists, + dfunc, + inds, + dists, + weight, post_dists_counter); } } @@ -228,13 +277,15 @@ void perform_rbc_query(const raft::handle_t &handle, * query which is useful for algorithms that need to perform * A * A.T. */ -template -void rbc_build_index(const raft::handle_t &handle, - BallCoverIndex &index, - distance_func dfunc) { - ASSERT(index.n == 2, - "only 2d vectors are supported in current implementation"); +template +void rbc_build_index(const raft::handle_t& handle, + BallCoverIndex& index, + distance_func dfunc) +{ + ASSERT(index.n == 2, "only 2d vectors are supported in current implementation"); ASSERT(!index.is_index_trained(), "index cannot be previously trained"); rmm::device_uvector R_knn_inds(index.m, handle.get_stream()); @@ -249,8 +300,8 @@ void rbc_build_index(const raft::handle_t &handle, * 2. Perform knn = bfknn(X, R, k) */ value_int k = 1; - k_closest_landmarks(handle, index, index.get_X(), index.m, k, - R_knn_inds.data(), R_knn_dists.data()); + k_closest_landmarks( + handle, index, index.get_X(), index.m, k, R_knn_inds.data(), R_knn_dists.data()); /** * 3. Create L_r = knn[:,0].T (CSR) @@ -258,8 +309,7 @@ void rbc_build_index(const raft::handle_t &handle, * Slice closest neighboring R * Secondary sort by (R_knn_inds, R_knn_dists) */ - construct_landmark_1nn(handle, R_knn_inds.data(), R_knn_dists.data(), k, - index); + construct_landmark_1nn(handle, R_knn_inds.data(), R_knn_dists.data(), k, index); /** * Compute radius of each R for filtering: p(q, r) <= p(q, q_r) + radius(r) @@ -271,16 +321,21 @@ void rbc_build_index(const raft::handle_t &handle, /** * Performs an all neighbors knn query (e.g. index == query) */ -template -void rbc_all_knn_query(const raft::handle_t &handle, - BallCoverIndex &index, - value_int k, value_idx *inds, value_t *dists, +template +void rbc_all_knn_query(const raft::handle_t& handle, + BallCoverIndex& index, + value_int k, + value_idx* inds, + value_t* dists, distance_func dfunc, // approximate nn options - bool perform_post_filtering = true, float weight = 1.0) { - ASSERT(index.n == 2, - "only 2d vectors are supported in current implementation"); + bool perform_post_filtering = true, + float weight = 1.0) +{ + ASSERT(index.n == 2, "only 2d vectors are supported in current implementation"); ASSERT(index.n_landmarks >= k, "number of landmark samples must be >= k"); ASSERT(!index.is_index_trained(), "index cannot be previously trained"); @@ -289,22 +344,30 @@ void rbc_all_knn_query(const raft::handle_t &handle, // For debugging / verification. Remove before releasing rmm::device_uvector dists_counter(index.m, handle.get_stream()); - rmm::device_uvector post_dists_counter(index.m, - handle.get_stream()); + rmm::device_uvector post_dists_counter(index.m, handle.get_stream()); sample_landmarks(handle, index); - k_closest_landmarks(handle, index, index.get_X(), index.m, k, - R_knn_inds.data(), R_knn_dists.data()); + k_closest_landmarks( + handle, index, index.get_X(), index.m, k, R_knn_inds.data(), R_knn_dists.data()); - construct_landmark_1nn(handle, R_knn_inds.data(), R_knn_dists.data(), k, - index); + construct_landmark_1nn(handle, R_knn_inds.data(), R_knn_dists.data(), k, index); compute_landmark_radii(handle, index); - perform_rbc_query(handle, index, index.get_X(), index.m, k, R_knn_inds.data(), - R_knn_dists.data(), dfunc, inds, dists, - dists_counter.data(), post_dists_counter.data(), weight, + perform_rbc_query(handle, + index, + index.get_X(), + index.m, + k, + R_knn_inds.data(), + R_knn_dists.data(), + dfunc, + inds, + dists, + dists_counter.data(), + post_dists_counter.data(), + weight, perform_post_filtering); } @@ -312,35 +375,50 @@ void rbc_all_knn_query(const raft::handle_t &handle, * Performs a knn query against an index. This assumes the index has * already been built. */ -template -void rbc_knn_query(const raft::handle_t &handle, - BallCoverIndex &index, - value_int k, const value_t *query, value_int n_query_pts, - value_idx *inds, value_t *dists, distance_func dfunc, +template +void rbc_knn_query(const raft::handle_t& handle, + BallCoverIndex& index, + value_int k, + const value_t* query, + value_int n_query_pts, + value_idx* inds, + value_t* dists, + distance_func dfunc, // approximate nn options - bool perform_post_filtering = true, float weight = 1.0) { - ASSERT(index.n == 2, - "only 2d vectors are supported in current implementation"); + bool perform_post_filtering = true, + float weight = 1.0) +{ + ASSERT(index.n == 2, "only 2d vectors are supported in current implementation"); ASSERT(index.n_landmarks >= k, "number of landmark samples must be >= k"); ASSERT(index.is_index_trained(), "index must be previously trained"); rmm::device_uvector R_knn_inds(k * index.m, handle.get_stream()); rmm::device_uvector R_knn_dists(k * index.m, handle.get_stream()); - k_closest_landmarks(handle, index, query, n_query_pts, k, R_knn_inds.data(), - R_knn_dists.data()); + k_closest_landmarks(handle, index, query, n_query_pts, k, R_knn_inds.data(), R_knn_dists.data()); // For debugging / verification. Remove before releasing rmm::device_uvector dists_counter(index.m, handle.get_stream()); - rmm::device_uvector post_dists_counter(index.m, - handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), post_dists_counter.data(), - post_dists_counter.data() + index.m, 0); - - perform_rbc_query(handle, index, query, n_query_pts, k, R_knn_inds.data(), - R_knn_dists.data(), dfunc, inds, dists, - dists_counter.data(), post_dists_counter.data(), weight, + rmm::device_uvector post_dists_counter(index.m, handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), post_dists_counter.data(), post_dists_counter.data() + index.m, 0); + + perform_rbc_query(handle, + index, + query, + n_query_pts, + k, + R_knn_inds.data(), + R_knn_dists.data(), + dfunc, + inds, + dists, + dists_counter.data(), + post_dists_counter.data(), + weight, perform_post_filtering); } diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh index c6cb679408..181dad1a90 100644 --- a/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh +++ b/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh @@ -27,7 +27,8 @@ namespace detail { struct NNComp { template - __host__ __device__ bool operator()(const one &t1, const two &t2) { + __host__ __device__ bool operator()(const one& t1, const two& t2) + { // sort first by each sample's reference landmark, if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false; @@ -39,17 +40,20 @@ struct NNComp { struct HaversineFunc { template - __device__ __host__ __forceinline__ value_t - operator()(const value_t *a, const value_t *b, const value_int n_dims) { - return raft::spatial::knn::detail::compute_haversine(a[0], b[0], a[1], - b[1]); + __device__ __host__ __forceinline__ value_t operator()(const value_t* a, + const value_t* b, + const value_int n_dims) + { + return raft::spatial::knn::detail::compute_haversine(a[0], b[0], a[1], b[1]); } }; struct EuclideanFunc { template - __device__ __host__ __forceinline__ value_t - operator()(const value_t *a, const value_t *b, const value_int n_dims) { + __device__ __host__ __forceinline__ value_t operator()(const value_t* a, + const value_t* b, + const value_int n_dims) + { value_t sum_sq = 0; for (value_int i = 0; i < n_dims; ++i) { value_t diff = a[i] - b[i]; @@ -63,7 +67,8 @@ struct EuclideanFunc { /** * Zeros the bit at location h in a one-hot encoded 32-bit int array */ -__device__ inline void _zero_bit(std::uint32_t *arr, std::uint32_t h) { +__device__ inline void _zero_bit(std::uint32_t* arr, std::uint32_t h) +{ int bit = h % 32; int idx = h / 32; @@ -71,7 +76,7 @@ __device__ inline void _zero_bit(std::uint32_t *arr, std::uint32_t h) { std::uint32_t old = arr[idx]; do { assumed = old; - old = atomicCAS(arr + idx, assumed, assumed & ~(1 << bit)); + old = atomicCAS(arr + idx, assumed, assumed & ~(1 << bit)); } while (assumed != old); } @@ -79,7 +84,8 @@ __device__ inline void _zero_bit(std::uint32_t *arr, std::uint32_t h) { * Returns whether or not bit at location h is nonzero in a one-hot * encoded 32-bit in array. */ -__device__ inline bool _get_val(std::uint32_t *arr, std::uint32_t h) { +__device__ inline bool _get_val(std::uint32_t* arr, std::uint32_t h) +{ int bit = h % 32; int idx = h / 32; return (arr[idx] & (1 << bit)) > 0; diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh index 4a476274dd..5d28258f7a 100644 --- a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh +++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh @@ -58,14 +58,24 @@ namespace detail { * @param output * @param weight */ -template -__global__ void perform_post_filter_registers( - const value_t *X, value_int n_cols, const value_idx *R_knn_inds, - const value_t *R_knn_dists, const value_t *R_radius, const value_t *landmarks, - int n_landmarks, value_int bitset_size, value_int k, distance_func dfunc, - std::uint32_t *output, float weight = 1.0) { +__global__ void perform_post_filter_registers(const value_t* X, + value_int n_cols, + const value_idx* R_knn_inds, + const value_t* R_knn_dists, + const value_t* R_radius, + const value_t* landmarks, + int n_landmarks, + value_int bitset_size, + value_int k, + distance_func dfunc, + std::uint32_t* output, + float weight = 1.0) +{ // allocate array of size n_landmarks / 32 ints extern __shared__ std::uint32_t shared_mem[]; @@ -98,8 +108,7 @@ __global__ void perform_post_filter_registers( for (value_int l = threadIdx.x; l < n_landmarks; l += tpb) { // compute p(q, r) value_t dist = dfunc(local_x_ptr, landmarks + (n_cols * l), n_cols); - if (dist > weight * (closest_R_dist + R_radius[l]) || - dist > 3 * closest_R_dist) { + if (dist > weight * (closest_R_dist + R_radius[l]) || dist > 3 * closest_R_dist) { _zero_bit(shared_mem, l); } } @@ -136,38 +145,58 @@ __global__ void perform_post_filter_registers( * @param k * @param dist_counter */ -template -__global__ void compute_final_dists_registers( - const value_t *X_index, const value_t *X, const value_int n_cols, - bitset_type *bitset, value_int bitset_size, const value_t *R_knn_dists, - const value_idx *R_indptr, const value_idx *R_1nn_inds, - const value_t *R_1nn_dists, value_idx *knn_inds, value_t *knn_dists, - value_int n_landmarks, value_int k, dist_func dfunc, - value_int *dist_counter) { +template +__global__ void compute_final_dists_registers(const value_t* X_index, + const value_t* X, + const value_int n_cols, + bitset_type* bitset, + value_int bitset_size, + const value_t* R_knn_dists, + const value_idx* R_indptr, + const value_idx* R_1nn_inds, + const value_t* R_1nn_dists, + value_idx* knn_inds, + value_t* knn_dists, + value_int n_landmarks, + value_int k, + dist_func dfunc, + value_int* dist_counter) +{ static constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ value_t shared_memK[kNumWarps * warp_q]; - __shared__ faiss::gpu::KeyValuePair - shared_memV[kNumWarps * warp_q]; + __shared__ faiss::gpu::KeyValuePair shared_memV[kNumWarps * warp_q]; - const value_t *x_ptr = X + (n_cols * blockIdx.x); + const value_t* x_ptr = X + (n_cols * blockIdx.x); value_t local_x_ptr[col_q]; for (value_int j = 0; j < n_cols; ++j) { local_x_ptr[j] = x_ptr[j]; } - faiss::gpu::KeyValueBlockSelect, warp_q, - thread_q, tpb> + faiss::gpu::KeyValueBlockSelect, + warp_q, + thread_q, + tpb> heap(faiss::gpu::Limits::getMax(), - faiss::gpu::Limits::getMax(), -1, shared_memK, shared_memV, + faiss::gpu::Limits::getMax(), + -1, + shared_memK, + shared_memV, k); const value_int n_k = faiss::gpu::utils::roundDown(k, faiss::gpu::kWarpSize); - value_int i = threadIdx.x; + value_int i = threadIdx.x; for (; i < n_k; i += tpb) { value_idx ind = knn_inds[blockIdx.x * k + i]; heap.add(knn_dists[blockIdx.x * k + i], R_knn_dists[ind * k], ind); @@ -185,33 +214,31 @@ __global__ void compute_final_dists_registers( // candidate if (_get_val(bitset + (blockIdx.x * bitset_size), cur_R_ind)) { value_idx R_start_offset = R_indptr[cur_R_ind]; - value_idx R_stop_offset = R_indptr[cur_R_ind + 1]; - value_idx R_size = R_stop_offset - R_start_offset; + value_idx R_stop_offset = R_indptr[cur_R_ind + 1]; + value_idx R_size = R_stop_offset - R_start_offset; // Loop through R's neighborhood in parallel // Round R_size to the nearest warp threads so they can // all be computing in parallel. - const value_int limit = - faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize); + const value_int limit = faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize); i = threadIdx.x; for (; i < limit; i += tpb) { value_idx cur_candidate_ind = R_1nn_inds[R_start_offset + i]; - value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; - value_t z = heap.warpKTopRDist == 0.00 - ? 0.0 - : (abs(heap.warpKTop - heap.warpKTopRDist) * - abs(heap.warpKTopRDist - cur_candidate_dist) - - heap.warpKTop * cur_candidate_dist) / - heap.warpKTopRDist; - z = isnan(z) ? 0.0 : z; + value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; + value_t z = heap.warpKTopRDist == 0.00 ? 0.0 + : (abs(heap.warpKTop - heap.warpKTopRDist) * + abs(heap.warpKTopRDist - cur_candidate_dist) - + heap.warpKTop * cur_candidate_dist) / + heap.warpKTopRDist; + z = isnan(z) ? 0.0 : z; // If lower bound on distance could possibly be in // the closest k neighbors, compute it and add to k-select value_t dist = std::numeric_limits::max(); if (z <= heap.warpKTop) { - const value_t *y_ptr = X_index + (n_cols * cur_candidate_ind); + const value_t* y_ptr = X_index + (n_cols * cur_candidate_ind); value_t local_y_ptr[col_q]; for (value_int j = 0; j < n_cols; ++j) { local_y_ptr[j] = y_ptr[j]; @@ -226,21 +253,20 @@ __global__ void compute_final_dists_registers( // second round guarantees to be only a single warp. if (i < R_size) { value_idx cur_candidate_ind = R_1nn_inds[R_start_offset + i]; - value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; + value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; - value_t z = heap.warpKTopRDist == 0.00 - ? 0.0 - : (abs(heap.warpKTop - heap.warpKTopRDist) * - abs(heap.warpKTopRDist - cur_candidate_dist) - - heap.warpKTop * cur_candidate_dist) / - heap.warpKTopRDist; + value_t z = heap.warpKTopRDist == 0.00 ? 0.0 + : (abs(heap.warpKTop - heap.warpKTopRDist) * + abs(heap.warpKTopRDist - cur_candidate_dist) - + heap.warpKTop * cur_candidate_dist) / + heap.warpKTopRDist; z = isnan(z) ? 0.0 : z; // If lower bound on distance could possibly be in // the closest k neighbors, compute it and add to k-select value_t dist = std::numeric_limits::max(); if (z <= heap.warpKTop) { - const value_t *y_ptr = X_index + (n_cols * cur_candidate_ind); + const value_t* y_ptr = X_index + (n_cols * cur_candidate_ind); value_t local_y_ptr[col_q]; for (value_int j = 0; j < n_cols; ++j) { local_y_ptr[j] = y_ptr[j]; @@ -257,7 +283,7 @@ __global__ void compute_final_dists_registers( for (value_int i = threadIdx.x; i < k; i += tpb) { knn_dists[blockIdx.x * k + i] = shared_memK[i]; - knn_inds[blockIdx.x * k + i] = shared_memV[i].value; + knn_inds[blockIdx.x * k + i] = shared_memV[i].value; } } @@ -278,28 +304,41 @@ __global__ void compute_final_dists_registers( * @param R_1nn_cols * @param R_1nn_dists */ -template -__global__ void block_rbc_kernel_registers( - const value_t *X_index, const value_t *X, - value_int n_cols, // n_cols should be 2 or 3 dims - const value_idx *R_knn_inds, const value_t *R_knn_dists, value_int m, - value_int k, const value_idx *R_indptr, const value_idx *R_1nn_cols, - const value_t *R_1nn_dists, value_idx *out_inds, value_t *out_dists, - value_int *dist_counter, value_t *R_radius, distance_func dfunc, - float weight = 1.0) { +template +__global__ void block_rbc_kernel_registers(const value_t* X_index, + const value_t* X, + value_int n_cols, // n_cols should be 2 or 3 dims + const value_idx* R_knn_inds, + const value_t* R_knn_dists, + value_int m, + value_int k, + const value_idx* R_indptr, + const value_idx* R_1nn_cols, + const value_t* R_1nn_dists, + value_idx* out_inds, + value_t* out_dists, + value_int* dist_counter, + value_t* R_radius, + distance_func dfunc, + float weight = 1.0) +{ static constexpr value_int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ value_t shared_memK[kNumWarps * warp_q]; - __shared__ faiss::gpu::KeyValuePair - shared_memV[kNumWarps * warp_q]; + __shared__ faiss::gpu::KeyValuePair shared_memV[kNumWarps * warp_q]; // TODO: Separate kernels for different widths: // 1. Very small (between 3 and 32) just use registers for columns of "blockIdx.x" // 2. Can fit comfortably in shared memory (32 to a few thousand?) // 3. Load each time individually. - const value_t *x_ptr = X + (n_cols * blockIdx.x); + const value_t* x_ptr = X + (n_cols * blockIdx.x); // Use registers only for 2d or 3d value_t local_x_ptr[col_q]; @@ -308,11 +347,18 @@ __global__ void block_rbc_kernel_registers( } // Each warp works on 1 R - faiss::gpu::KeyValueBlockSelect, warp_q, - thread_q, tpb> + faiss::gpu::KeyValueBlockSelect, + warp_q, + thread_q, + tpb> heap(faiss::gpu::Limits::getMax(), - faiss::gpu::Limits::getMax(), -1, shared_memK, shared_memV, + faiss::gpu::Limits::getMax(), + -1, + shared_memK, + shared_memV, k); value_t min_R_dist = R_knn_dists[blockIdx.x * k + (k - 1)]; @@ -327,7 +373,7 @@ __global__ void block_rbc_kernel_registers( // determining if the distance could even potentially be in the heap. for (value_int cur_k = 0; cur_k < k; ++cur_k) { // index and distance to current blockIdx.x's closest landmark - value_t cur_R_dist = R_knn_dists[blockIdx.x * k + cur_k]; + value_t cur_R_dist = R_knn_dists[blockIdx.x * k + cur_k]; value_idx cur_R_ind = R_knn_inds[blockIdx.x * k + cur_k]; // Equation (2) in Cayton's paper- prune out R's which are > 3 * p(q, r_q) @@ -336,38 +382,37 @@ __global__ void block_rbc_kernel_registers( // The whole warp should iterate through the elements in the current R value_idx R_start_offset = R_indptr[cur_R_ind]; - value_idx R_stop_offset = R_indptr[cur_R_ind + 1]; + value_idx R_stop_offset = R_indptr[cur_R_ind + 1]; value_idx R_size = R_stop_offset - R_start_offset; - value_int limit = - faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize); - value_int i = threadIdx.x; + value_int limit = faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize); + value_int i = threadIdx.x; for (; i < limit; i += tpb) { // Index and distance of current candidate's nearest landmark value_idx cur_candidate_ind = R_1nn_cols[R_start_offset + i]; - value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; + value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; // Take 2 landmarks l_1 and l_2 where l_1 is the furthest point in the heap // and l_2 is the current landmark R. s is the current data point and // t is the new candidate data point. We know that: - // d(s, t) cannot possibly be any smaller than | d(s, l_1) - d(l_1, l_2) | * | d(l_1, l_2) - d(l_2, t) | - d(s, l_1) * d(l_2, t) + // d(s, t) cannot possibly be any smaller than | d(s, l_1) - d(l_1, l_2) | * | d(l_1, l_2) - + // d(l_2, t) | - d(s, l_1) * d(l_2, t) - // Therefore, if d(s, t) >= d(s, l_1) from the computation above, we know that the distance to the candidate point - // cannot possibly be in the nearest neighbors. However, if d(s, t) < d(s, l_1) then we should compute the - // distance because it's possible it could be smaller. + // Therefore, if d(s, t) >= d(s, l_1) from the computation above, we know that the distance to + // the candidate point cannot possibly be in the nearest neighbors. However, if d(s, t) < d(s, + // l_1) then we should compute the distance because it's possible it could be smaller. // - value_t z = heap.warpKTopRDist == 0.00 - ? 0.0 - : (abs(heap.warpKTop - heap.warpKTopRDist) * - abs(heap.warpKTopRDist - cur_candidate_dist) - - heap.warpKTop * cur_candidate_dist) / - heap.warpKTopRDist; - - z = isnan(z) ? 0.0 : z; + value_t z = heap.warpKTopRDist == 0.00 ? 0.0 + : (abs(heap.warpKTop - heap.warpKTopRDist) * + abs(heap.warpKTopRDist - cur_candidate_dist) - + heap.warpKTop * cur_candidate_dist) / + heap.warpKTopRDist; + + z = isnan(z) ? 0.0 : z; value_t dist = std::numeric_limits::max(); if (i < k || z <= heap.warpKTop) { - const value_t *y_ptr = X_index + (n_cols * cur_candidate_ind); + const value_t* y_ptr = X_index + (n_cols * cur_candidate_ind); value_t local_y_ptr[col_q]; for (value_int j = 0; j < n_cols; ++j) { local_y_ptr[j] = y_ptr[j]; @@ -381,18 +426,17 @@ __global__ void block_rbc_kernel_registers( if (i < R_size) { value_idx cur_candidate_ind = R_1nn_cols[R_start_offset + i]; - value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; - value_t z = heap.warpKTopRDist == 0.0 - ? 0.0 - : (abs(heap.warpKTop - heap.warpKTopRDist) * - abs(heap.warpKTopRDist - cur_candidate_dist) - - heap.warpKTop * cur_candidate_dist) / - heap.warpKTopRDist; - - z = isnan(z) ? 0.0 : z; + value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; + value_t z = heap.warpKTopRDist == 0.0 ? 0.0 + : (abs(heap.warpKTop - heap.warpKTopRDist) * + abs(heap.warpKTopRDist - cur_candidate_dist) - + heap.warpKTop * cur_candidate_dist) / + heap.warpKTopRDist; + + z = isnan(z) ? 0.0 : z; value_t dist = std::numeric_limits::max(); if (i < k || z <= heap.warpKTop) { - const value_t *y_ptr = X_index + (n_cols * cur_candidate_ind); + const value_t* y_ptr = X_index + (n_cols * cur_candidate_ind); value_t local_y_ptr[col_q]; for (value_int j = 0; j < n_cols; ++j) { local_y_ptr[j] = y_ptr[j]; @@ -411,124 +455,327 @@ __global__ void block_rbc_kernel_registers( for (int i = threadIdx.x; i < k; i += tpb) { out_dists[blockIdx.x * k + i] = shared_memK[i]; - out_inds[blockIdx.x * k + i] = shared_memV[i].value; + out_inds[blockIdx.x * k + i] = shared_memV[i].value; } } -template -void rbc_low_dim_pass_one(const raft::handle_t &handle, - BallCoverIndex &index, - const value_t *query, const value_int n_query_rows, - value_int k, const value_idx *R_knn_inds, - const value_t *R_knn_dists, dist_func dfunc, - value_idx *inds, value_t *dists, float weight, - value_int *dists_counter) { +template +void rbc_low_dim_pass_one(const raft::handle_t& handle, + BallCoverIndex& index, + const value_t* query, + const value_int n_query_rows, + value_int k, + const value_idx* R_knn_inds, + const value_t* R_knn_dists, + dist_func dfunc, + value_idx* inds, + value_t* dists, + float weight, + value_int* dists_counter) +{ if (k <= 32) block_rbc_kernel_registers - <<>>( - index.get_X(), query, index.n, R_knn_inds, R_knn_dists, index.m, k, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, dists_counter, index.get_R_radius(), dfunc, weight); + <<>>(index.get_X(), + query, + index.n, + R_knn_inds, + R_knn_dists, + index.m, + k, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + dists_counter, + index.get_R_radius(), + dfunc, + weight); else if (k <= 64) block_rbc_kernel_registers - <<>>( - index.get_X(), query, index.n, R_knn_inds, R_knn_dists, index.m, k, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, dists_counter, index.get_R_radius(), dfunc, weight); + <<>>(index.get_X(), + query, + index.n, + R_knn_inds, + R_knn_dists, + index.m, + k, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + dists_counter, + index.get_R_radius(), + dfunc, + weight); else if (k <= 128) block_rbc_kernel_registers - <<>>( - index.get_X(), query, index.n, R_knn_inds, R_knn_dists, index.m, k, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, dists_counter, index.get_R_radius(), dfunc, weight); + <<>>(index.get_X(), + query, + index.n, + R_knn_inds, + R_knn_dists, + index.m, + k, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + dists_counter, + index.get_R_radius(), + dfunc, + weight); else if (k <= 256) block_rbc_kernel_registers - <<>>( - index.get_X(), query, index.n, R_knn_inds, R_knn_dists, index.m, k, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, dists_counter, index.get_R_radius(), dfunc, weight); + <<>>(index.get_X(), + query, + index.n, + R_knn_inds, + R_knn_dists, + index.m, + k, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + dists_counter, + index.get_R_radius(), + dfunc, + weight); else if (k <= 512) block_rbc_kernel_registers - <<>>( - index.get_X(), query, index.n, R_knn_inds, R_knn_dists, index.m, k, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, dists_counter, index.get_R_radius(), dfunc, weight); + <<>>(index.get_X(), + query, + index.n, + R_knn_inds, + R_knn_dists, + index.m, + k, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + dists_counter, + index.get_R_radius(), + dfunc, + weight); else if (k <= 1024) block_rbc_kernel_registers - <<>>( - index.get_X(), query, index.n, R_knn_inds, R_knn_dists, index.m, k, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, dists_counter, index.get_R_radius(), dfunc, weight); + <<>>(index.get_X(), + query, + index.n, + R_knn_inds, + R_knn_dists, + index.m, + k, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + dists_counter, + index.get_R_radius(), + dfunc, + weight); } -template -void rbc_low_dim_pass_two(const raft::handle_t &handle, - BallCoverIndex &index, - const value_t *query, const value_int n_query_rows, - value_int k, const value_idx *R_knn_inds, - const value_t *R_knn_dists, dist_func dfunc, - value_idx *inds, value_t *dists, float weight, - value_int *post_dists_counter) { +template +void rbc_low_dim_pass_two(const raft::handle_t& handle, + BallCoverIndex& index, + const value_t* query, + const value_int n_query_rows, + value_int k, + const value_idx* R_knn_inds, + const value_t* R_knn_dists, + dist_func dfunc, + value_idx* inds, + value_t* dists, + float weight, + value_int* post_dists_counter) +{ const value_int bitset_size = ceil(index.n_landmarks / 32.0); - rmm::device_uvector bitset(bitset_size * index.m, - handle.get_stream()); + rmm::device_uvector bitset(bitset_size * index.m, handle.get_stream()); perform_post_filter_registers - <<>>(index.get_X(), index.n, R_knn_inds, R_knn_dists, - index.get_R_radius(), index.get_R(), - index.n_landmarks, bitset_size, k, dfunc, - bitset.data(), weight); + <<>>( + index.get_X(), + index.n, + R_knn_inds, + R_knn_dists, + index.get_R_radius(), + index.get_R(), + index.n_landmarks, + bitset_size, + k, + dfunc, + bitset.data(), + weight); if (k <= 32) - compute_final_dists_registers - <<>>( - index.get_X(), query, index.n, bitset.data(), bitset_size, R_knn_dists, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, index.n_landmarks, k, dfunc, post_dists_counter); + compute_final_dists_registers + <<>>(index.get_X(), + query, + index.n, + bitset.data(), + bitset_size, + R_knn_dists, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); else if (k <= 64) - compute_final_dists_registers - <<>>( - index.get_X(), query, index.n, bitset.data(), bitset_size, R_knn_dists, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, index.n_landmarks, k, dfunc, post_dists_counter); + compute_final_dists_registers + <<>>(index.get_X(), + query, + index.n, + bitset.data(), + bitset_size, + R_knn_dists, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); else if (k <= 128) - compute_final_dists_registers - <<>>( - index.get_X(), query, index.n, bitset.data(), bitset_size, R_knn_dists, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, index.n_landmarks, k, dfunc, post_dists_counter); + compute_final_dists_registers + <<>>(index.get_X(), + query, + index.n, + bitset.data(), + bitset_size, + R_knn_dists, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); else if (k <= 256) - compute_final_dists_registers - <<>>( - index.get_X(), query, index.n, bitset.data(), bitset_size, R_knn_dists, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, index.n_landmarks, k, dfunc, post_dists_counter); + compute_final_dists_registers + <<>>(index.get_X(), + query, + index.n, + bitset.data(), + bitset_size, + R_knn_dists, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); else if (k <= 512) - compute_final_dists_registers - <<>>( - index.get_X(), query, index.n, bitset.data(), bitset_size, R_knn_dists, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, index.n_landmarks, k, dfunc, post_dists_counter); + compute_final_dists_registers + <<>>(index.get_X(), + query, + index.n, + bitset.data(), + bitset_size, + R_knn_dists, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); else if (k <= 1024) - compute_final_dists_registers - <<>>( - index.get_X(), query, index.n, bitset.data(), bitset_size, R_knn_dists, - index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(), - inds, dists, index.n_landmarks, k, dfunc, post_dists_counter); + compute_final_dists_registers + <<>>(index.get_X(), + query, + index.n, + bitset.data(), + bitset_size, + R_knn_dists, + index.get_R_indptr(), + index.get_R_1nn_cols(), + index.get_R_1nn_dists(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); } }; // namespace detail diff --git a/cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh b/cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh index d2f7bc2210..a53a5b03e6 100644 --- a/cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh @@ -25,15 +25,19 @@ namespace gpu { // `Dir` true, produce largest values. // `Dir` false, produce smallest values. -template +template struct KeyValueBlockSelect { - static constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; + static constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; static constexpr int kTotalWarpSortSize = NumWarpQ; - __device__ inline KeyValueBlockSelect(K initKVal, K initVKey, V initVVal, - K* smemK, KeyValuePair* smemV, - int k) + __device__ inline KeyValueBlockSelect( + K initKVal, K initVKey, V initVVal, K* smemK, KeyValuePair* smemV, int k) : initK(initKVal), initVk(initVKey), initVv(initVVal), @@ -42,53 +46,55 @@ struct KeyValueBlockSelect { warpKTopRDist(initKVal), sharedK(smemK), sharedV(smemV), - kMinus1(k - 1) { - static_assert(utils::isPowerOf2(ThreadsPerBlock), - "threads must be a power-of-2"); + kMinus1(k - 1) + { + static_assert(utils::isPowerOf2(ThreadsPerBlock), "threads must be a power-of-2"); static_assert(utils::isPowerOf2(NumWarpQ), "warp queue must be power-of-2"); // Fill the per-thread queue keys with the default value #pragma unroll for (int i = 0; i < NumThreadQ; ++i) { - threadK[i] = initK; - threadV[i].key = initVk; + threadK[i] = initK; + threadV[i].key = initVk; threadV[i].value = initVv; } int laneId = getLaneId(); int warpId = threadIdx.x / kWarpSize; - warpK = sharedK + warpId * kTotalWarpSortSize; - warpV = sharedV + warpId * kTotalWarpSortSize; + warpK = sharedK + warpId * kTotalWarpSortSize; + warpV = sharedV + warpId * kTotalWarpSortSize; // Fill warp queue (only the actual queue space is fine, not where // we write the per-thread queues for merging) for (int i = laneId; i < NumWarpQ; i += kWarpSize) { - warpK[i] = initK; - warpV[i].key = initVk; + warpK[i] = initK; + warpV[i].key = initVk; warpV[i].value = initVv; } warpFence(); } - __device__ inline void addThreadQ(K k, K vk, V vv) { + __device__ inline void addThreadQ(K k, K vk, V vv) + { if (Dir ? Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) { // Rotate right #pragma unroll for (int i = NumThreadQ - 1; i > 0; --i) { - threadK[i] = threadK[i - 1]; - threadV[i].key = threadV[i - 1].key; + threadK[i] = threadK[i - 1]; + threadV[i].key = threadV[i - 1].key; threadV[i].value = threadV[i - 1].value; } - threadK[0] = k; - threadV[0].key = vk; + threadK[0] = k; + threadV[0].key = vk; threadV[0].value = vv; ++numVals; } } - __device__ inline void checkThreadQ() { + __device__ inline void checkThreadQ() + { bool needSort = (numVals == NumThreadQ); #if CUDA_VERSION >= 9000 @@ -111,13 +117,13 @@ struct KeyValueBlockSelect { #pragma unroll for (int i = 0; i < NumThreadQ; ++i) { - threadK[i] = initK; - threadV[i].key = initVk; + threadK[i] = initK; + threadV[i].key = initVk; threadV[i].value = initVv; } // We have to beat at least this element - warpKTop = warpK[kMinus1]; + warpKTop = warpK[kMinus1]; warpKTopRDist = warpV[kMinus1].key; warpFence(); @@ -126,7 +132,8 @@ struct KeyValueBlockSelect { /// This function handles sorting and merging together the /// per-thread queues with the warp-wide queue, creating a sorted /// list across both - __device__ inline void mergeWarpQ() { + __device__ inline void mergeWarpQ() + { int laneId = getLaneId(); // Sort all of the per-thread queues @@ -138,8 +145,8 @@ struct KeyValueBlockSelect { #pragma unroll for (int i = 0; i < kNumWarpQRegisters; ++i) { - warpKRegisters[i] = warpK[i * kWarpSize + laneId]; - warpVRegisters[i].key = warpV[i * kWarpSize + laneId].key; + warpKRegisters[i] = warpK[i * kWarpSize + laneId]; + warpVRegisters[i].key = warpV[i * kWarpSize + laneId].key; warpVRegisters[i].value = warpV[i * kWarpSize + laneId].value; } @@ -148,15 +155,14 @@ struct KeyValueBlockSelect { // The warp queue is already sorted, and now that we've sorted the // per-thread queue, merge both sorted lists together, producing // one sorted list - warpMergeAnyRegistersKVP(warpKRegisters, warpVRegisters, threadK, - threadV); + warpMergeAnyRegistersKVP( + warpKRegisters, warpVRegisters, threadK, threadV); // Write back out the warp queue #pragma unroll for (int i = 0; i < kNumWarpQRegisters; ++i) { - warpK[i * kWarpSize + laneId] = warpKRegisters[i]; - warpV[i * kWarpSize + laneId].key = warpVRegisters[i].key; + warpK[i * kWarpSize + laneId] = warpKRegisters[i]; + warpV[i * kWarpSize + laneId].key = warpVRegisters[i].key; warpV[i * kWarpSize + laneId].value = warpVRegisters[i].value; } @@ -165,12 +171,14 @@ struct KeyValueBlockSelect { /// WARNING: all threads in a warp must participate in this. /// Otherwise, you must call the constituent parts separately. - __device__ inline void add(K k, K vk, V vv) { + __device__ inline void add(K k, K vk, V vv) + { addThreadQ(k, vk, vv); checkThreadQ(); } - __device__ inline void reduce() { + __device__ inline void reduce() + { // Have all warps dump and merge their queues; this will produce // the final per-warp results mergeWarpQ(); @@ -182,8 +190,8 @@ struct KeyValueBlockSelect { // All warp queues are contiguous in smem. // Now, we have kNumWarps lists of NumWarpQ elements. // This is a power of 2. - FinalBlockMerge, NumWarpQ, - Dir, Comp>::merge(sharedK, sharedV); + FinalBlockMerge, NumWarpQ, Dir, Comp>::merge( + sharedK, sharedV); // The block-wide merge has a trailing syncthreads } diff --git a/cpp/include/raft/spatial/knn/detail/common_faiss.h b/cpp/include/raft/spatial/knn/detail/common_faiss.h index 0c0398a336..5618186dfc 100644 --- a/cpp/include/raft/spatial/knn/detail/common_faiss.h +++ b/cpp/include/raft/spatial/knn/detail/common_faiss.h @@ -27,37 +27,26 @@ namespace spatial { namespace knn { namespace detail { -inline faiss::MetricType build_faiss_metric( - raft::distance::DistanceType metric) { +inline faiss::MetricType build_faiss_metric(raft::distance::DistanceType metric) +{ switch (metric) { case raft::distance::DistanceType::CosineExpanded: return faiss::MetricType::METRIC_INNER_PRODUCT; case raft::distance::DistanceType::CorrelationExpanded: return faiss::MetricType::METRIC_INNER_PRODUCT; - case raft::distance::DistanceType::L2Expanded: - return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L2Unexpanded: - return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L2SqrtExpanded: - return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L2SqrtUnexpanded: - return faiss::MetricType::METRIC_L2; - case raft::distance::DistanceType::L1: - return faiss::MetricType::METRIC_L1; - case raft::distance::DistanceType::InnerProduct: - return faiss::MetricType::METRIC_INNER_PRODUCT; - case raft::distance::DistanceType::LpUnexpanded: - return faiss::MetricType::METRIC_Lp; - case raft::distance::DistanceType::Linf: - return faiss::MetricType::METRIC_Linf; - case raft::distance::DistanceType::Canberra: - return faiss::MetricType::METRIC_Canberra; - case raft::distance::DistanceType::BrayCurtis: - return faiss::MetricType::METRIC_BrayCurtis; + case raft::distance::DistanceType::L2Expanded: return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L2Unexpanded: return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L2SqrtExpanded: return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L2SqrtUnexpanded: return faiss::MetricType::METRIC_L2; + case raft::distance::DistanceType::L1: return faiss::MetricType::METRIC_L1; + case raft::distance::DistanceType::InnerProduct: return faiss::MetricType::METRIC_INNER_PRODUCT; + case raft::distance::DistanceType::LpUnexpanded: return faiss::MetricType::METRIC_Lp; + case raft::distance::DistanceType::Linf: return faiss::MetricType::METRIC_Linf; + case raft::distance::DistanceType::Canberra: return faiss::MetricType::METRIC_Canberra; + case raft::distance::DistanceType::BrayCurtis: return faiss::MetricType::METRIC_BrayCurtis; case raft::distance::DistanceType::JensenShannon: return faiss::MetricType::METRIC_JensenShannon; - default: - THROW("MetricType not supported: %d", metric); + default: THROW("MetricType not supported: %d", metric); } } diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh index f774d9d1ea..47fc62066d 100644 --- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh +++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh @@ -29,19 +29,21 @@ namespace knn { namespace detail { template -DI void loadAllWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const IdxT m, - const unsigned int numOfNN) { +DI void loadAllWarpQShmem(myWarpSelect& heapArr, + Pair* shDumpKV, + const IdxT m, + const unsigned int numOfNN) +{ const int lid = raft::laneId(); #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = - (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto rowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; if (rowId < m) { #pragma unroll for (int j = 0; j < heapArr[i]->kNumWarpQRegisters; ++j) { const int idx = j * warpSize + lid; if (idx < numOfNN) { - Pair KVPair = shDumpKV[rowId * numOfNN + idx]; + Pair KVPair = shDumpKV[rowId * numOfNN + idx]; heapArr[i]->warpV[j] = KVPair.key; heapArr[i]->warpK[j] = KVPair.value; } @@ -51,14 +53,17 @@ DI void loadAllWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const IdxT m, } template -DI void loadWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const int rowId, - const unsigned int numOfNN) { +DI void loadWarpQShmem(myWarpSelect& heapArr, + Pair* shDumpKV, + const int rowId, + const unsigned int numOfNN) +{ const int lid = raft::laneId(); #pragma unroll for (int j = 0; j < heapArr->kNumWarpQRegisters; ++j) { const int idx = j * warpSize + lid; if (idx < numOfNN) { - Pair KVPair = shDumpKV[rowId * numOfNN + idx]; + Pair KVPair = shDumpKV[rowId * numOfNN + idx]; heapArr->warpV[j] = KVPair.key; heapArr->warpK[j] = KVPair.value; } @@ -66,25 +71,31 @@ DI void loadWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const int rowId, } template -DI void storeWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const IdxT rowId, - const unsigned int numOfNN) { +DI void storeWarpQShmem(myWarpSelect& heapArr, + Pair* shDumpKV, + const IdxT rowId, + const unsigned int numOfNN) +{ const int lid = raft::laneId(); #pragma unroll for (int j = 0; j < heapArr->kNumWarpQRegisters; ++j) { const int idx = j * warpSize + lid; if (idx < numOfNN) { - Pair otherKV = Pair(heapArr->warpV[j], heapArr->warpK[j]); + Pair otherKV = Pair(heapArr->warpV[j], heapArr->warpK[j]); shDumpKV[rowId * numOfNN + idx] = otherKV; } } } -template -DI void storeWarpQGmem(myWarpSelect &heapArr, OutT *out_dists, IdxT *out_inds, - const IdxT m, const unsigned int numOfNN, - const IdxT starty) { +template +DI void storeWarpQGmem(myWarpSelect& heapArr, + OutT* out_dists, + IdxT* out_inds, + const IdxT m, + const unsigned int numOfNN, + const IdxT starty) +{ const int lid = raft::laneId(); #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { @@ -95,18 +106,21 @@ DI void storeWarpQGmem(myWarpSelect &heapArr, OutT *out_dists, IdxT *out_inds, const auto idx = j * warpSize + lid; if (idx < numOfNN) { out_dists[gmemRowId * numOfNN + idx] = heapArr[i]->warpK[j]; - out_inds[gmemRowId * numOfNN + idx] = (IdxT)heapArr[i]->warpV[j]; + out_inds[gmemRowId * numOfNN + idx] = (IdxT)heapArr[i]->warpV[j]; } } } } } -template -DI void loadPrevTopKsGmemWarpQ(myWarpSelect &heapArr, OutT *out_dists, - IdxT *out_inds, const IdxT m, - const unsigned int numOfNN, const IdxT starty) { +template +DI void loadPrevTopKsGmemWarpQ(myWarpSelect& heapArr, + OutT* out_dists, + IdxT* out_inds, + const IdxT m, + const unsigned int numOfNN, + const IdxT starty) +{ const int lid = raft::laneId(); #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { @@ -121,17 +135,17 @@ DI void loadPrevTopKsGmemWarpQ(myWarpSelect &heapArr, OutT *out_dists, } } auto constexpr kLaneWarpKTop = heapArr[i]->kNumWarpQRegisters - 1; - heapArr[i]->warpKTop = - raft::shfl(heapArr[i]->warpK[kLaneWarpKTop], heapArr[i]->kLane); + heapArr[i]->warpKTop = raft::shfl(heapArr[i]->warpK[kLaneWarpKTop], heapArr[i]->kLane); } } } template -DI void updateSortedWarpQ(myWarpSelect &heapArr, Pair *allWarpTopKs, int rowId, - int finalNumVals, int startId = 0) { +DI void updateSortedWarpQ( + myWarpSelect& heapArr, Pair* allWarpTopKs, int rowId, int finalNumVals, int startId = 0) +{ constexpr uint32_t mask = 0xffffffffu; - const int lid = raft::laneId(); + const int lid = raft::laneId(); // calculate srcLane such that tid 0 -> 31, 1 -> 0,... 31 -> 30. // warp around 0 to 31 required for NN > 32 const auto srcLane = (warpSize + (lid - 1)) & (warpSize - 1); @@ -140,12 +154,11 @@ DI void updateSortedWarpQ(myWarpSelect &heapArr, Pair *allWarpTopKs, int rowId, Pair KVPair = allWarpTopKs[rowId * (256) + k]; #pragma unroll for (int i = 0; i < NumWarpQRegs; i++) { - unsigned activeLanes = - __ballot_sync(mask, KVPair.value < heapArr->warpK[i]); + unsigned activeLanes = __ballot_sync(mask, KVPair.value < heapArr->warpK[i]); if (activeLanes) { Pair tempKV; - tempKV.value = raft::shfl(heapArr->warpK[i], srcLane); - tempKV.key = raft::shfl(heapArr->warpV[i], srcLane); + tempKV.value = raft::shfl(heapArr->warpK[i], srcLane); + tempKV.key = raft::shfl(heapArr->warpV[i], srcLane); const auto firstActiveLane = __ffs(activeLanes) - 1; if (firstActiveLane == lid) { heapArr->warpK[i] = KVPair.value; @@ -168,43 +181,60 @@ DI void updateSortedWarpQ(myWarpSelect &heapArr, Pair *allWarpTopKs, int rowId, } } -template -__global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( - const DataT *x, const DataT *y, const DataT *_xn, const DataT *_yn, - const IdxT m, const IdxT n, const IdxT k, const IdxT lda, const IdxT ldb, - const IdxT ldd, CoreLambda core_op, FinalLambda fin_op, bool sqrt, - unsigned int numOfNN, int *mutexes, OutT *out_dists, IdxT *out_inds) { +template +__global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(const DataT* x, + const DataT* y, + const DataT* _xn, + const DataT* _yn, + const IdxT m, + const IdxT n, + const IdxT k, + const IdxT lda, + const IdxT ldb, + const IdxT ldd, + CoreLambda core_op, + FinalLambda fin_op, + bool sqrt, + unsigned int numOfNN, + int* mutexes, + OutT* out_dists, + IdxT* out_inds) +{ extern __shared__ char smem[]; typedef cub::KeyValuePair Pair; constexpr auto identity = std::numeric_limits::max(); - constexpr auto keyMax = std::numeric_limits::max(); - constexpr auto Dir = false; - typedef faiss::gpu::WarpSelect< - AccT, uint32_t, Dir, faiss::gpu::Comparator, NumWarpQ, NumThreadQ, 32> - myWarpSelect; - - auto rowEpilog_lambda = [m, n, numOfNN, out_dists, out_inds, - mutexes] __device__(IdxT gridStrideY) { - if (gridDim.x == 1) { - return; - } + constexpr auto keyMax = std::numeric_limits::max(); + constexpr auto Dir = false; + typedef faiss::gpu:: + WarpSelect, NumWarpQ, NumThreadQ, 32> + myWarpSelect; - volatile int *mutex = mutexes; + auto rowEpilog_lambda = [m, n, numOfNN, out_dists, out_inds, mutexes] __device__( + IdxT gridStrideY) { + if (gridDim.x == 1) { return; } - Pair *shDumpKV = nullptr; + volatile int* mutex = mutexes; + + Pair* shDumpKV = nullptr; if (useNorms) { - shDumpKV = - (Pair *)(&smem[Policy::SmemSize + - ((Policy::Mblk + Policy::Nblk) * sizeof(DataT))]); + shDumpKV = (Pair*)(&smem[Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT))]); } else { - shDumpKV = (Pair *)(&smem[Policy::SmemSize]); + shDumpKV = (Pair*)(&smem[Policy::SmemSize]); } - const int lid = threadIdx.x % warpSize; + const int lid = threadIdx.x % warpSize; const IdxT starty = gridStrideY + (threadIdx.x / Policy::AccThCols); // 0 -> consumer done consuming the buffer. @@ -215,7 +245,7 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( auto cta_processed = 0; myWarpSelect heapArr1(identity, keyMax, numOfNN); myWarpSelect heapArr2(identity, keyMax, numOfNN); - myWarpSelect *heapArr[] = {&heapArr1, &heapArr2}; + myWarpSelect* heapArr[] = {&heapArr1, &heapArr2}; __syncwarp(); loadAllWarpQShmem(heapArr, &shDumpKV[0], m, numOfNN); @@ -224,7 +254,7 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( if (threadIdx.x == 0) { int32_t old = -3; while (old != -1) { - old = atomicCAS((int *)&mutex[gridStrideY / Policy::Mblk], -2, -1); + old = atomicCAS((int*)&mutex[gridStrideY / Policy::Mblk], -2, -1); } __threadfence(); } @@ -232,18 +262,17 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = starty + i * Policy::AccThRows; - const auto shMemRowId = - (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto rowId = starty + i * Policy::AccThRows; + const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; #pragma unroll for (int j = 0; j < heapArr[i]->kNumWarpQRegisters; ++j) { Pair otherKV; - otherKV.value = identity; - otherKV.key = keyMax; + otherKV.value = identity; + otherKV.key = keyMax; const auto idx = j * warpSize + lid; if (idx < numOfNN && rowId < m) { - otherKV.value = out_dists[rowId * numOfNN + idx]; - otherKV.key = (uint32_t)out_inds[rowId * numOfNN + idx]; + otherKV.value = out_dists[rowId * numOfNN + idx]; + otherKV.key = (uint32_t)out_inds[rowId * numOfNN + idx]; shDumpKV[shMemRowId * numOfNN + idx] = otherKV; } } @@ -260,19 +289,16 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = starty + i * Policy::AccThRows; - const auto shMemRowId = - (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto rowId = starty + i * Policy::AccThRows; + const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; if (rowId < m) { #pragma unroll for (int j = 0; j < heapArr[i]->kNumWarpQRegisters; ++j) { Pair otherKV; - otherKV.value = identity; - otherKV.key = keyMax; + otherKV.value = identity; + otherKV.key = keyMax; const auto idx = j * warpSize + lid; - if (idx < numOfNN) { - otherKV = shDumpKV[shMemRowId * numOfNN + idx]; - } + if (idx < numOfNN) { otherKV = shDumpKV[shMemRowId * numOfNN + idx]; } heapArr[i]->add(otherKV.value, otherKV.key); } } @@ -284,20 +310,17 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( const auto rowId = starty + i * Policy::AccThRows; if (rowId < m) { bool needSort = (heapArr[i]->numVals > 0); - needSort = __any_sync(0xffffffff, needSort); - if (needSort) { - heapArr[i]->reduce(); - } + needSort = __any_sync(0xffffffff, needSort); + if (needSort) { heapArr[i]->reduce(); } } } - storeWarpQGmem(heapArr, out_dists, out_inds, m, numOfNN, - starty); + storeWarpQGmem(heapArr, out_dists, out_inds, m, numOfNN, starty); } else { if (threadIdx.x == 0) { - int32_t old = -1; + int32_t old = -1; int32_t blkIdX = (int32_t)blockIdx.x; while (old != blkIdX) { - old = atomicCAS((int *)&mutex[gridStrideY / Policy::Mblk], 0, blkIdX); + old = atomicCAS((int*)&mutex[gridStrideY / Policy::Mblk], 0, blkIdX); } __threadfence(); } @@ -305,14 +328,13 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = starty + i * Policy::AccThRows; - const auto shMemRowId = - (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto rowId = starty + i * Policy::AccThRows; + const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; if (rowId < m) { for (int idx = lid; idx < numOfNN; idx += warpSize) { - Pair KVPair = shDumpKV[shMemRowId * numOfNN + idx]; + Pair KVPair = shDumpKV[shMemRowId * numOfNN + idx]; out_dists[rowId * numOfNN + idx] = KVPair.value; - out_inds[rowId * numOfNN + idx] = (IdxT)KVPair.key; + out_inds[rowId * numOfNN + idx] = (IdxT)KVPair.key; } } } @@ -328,7 +350,9 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( // epilogue operation lambda for final value calculation auto epilog_lambda = [numOfNN, m, n, ldd, out_dists, out_inds] __device__( AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT * regxn, DataT * regyn, IdxT gridStrideX, + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, IdxT gridStrideY) { if (useNorms) { #pragma unroll @@ -340,36 +364,34 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( } } - Pair *shDumpKV = nullptr; + Pair* shDumpKV = nullptr; if (useNorms) { constexpr size_t shmemSize = Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT)); - shDumpKV = (Pair *)(&smem[shmemSize]); + shDumpKV = (Pair*)(&smem[shmemSize]); } else { - shDumpKV = (Pair *)(&smem[Policy::SmemSize]); + shDumpKV = (Pair*)(&smem[Policy::SmemSize]); } constexpr uint32_t mask = 0xffffffffu; - const IdxT starty = gridStrideY + (threadIdx.x / Policy::AccThCols); - const IdxT startx = gridStrideX + (threadIdx.x % Policy::AccThCols); - const int lid = raft::laneId(); + const IdxT starty = gridStrideY + (threadIdx.x / Policy::AccThCols); + const IdxT startx = gridStrideX + (threadIdx.x % Policy::AccThCols); + const int lid = raft::laneId(); myWarpSelect heapArr1(identity, keyMax, numOfNN); myWarpSelect heapArr2(identity, keyMax, numOfNN); - myWarpSelect *heapArr[] = {&heapArr1, &heapArr2}; + myWarpSelect* heapArr[] = {&heapArr1, &heapArr2}; if (usePrevTopKs) { if (gridStrideX == blockIdx.x * Policy::Nblk) { - loadPrevTopKsGmemWarpQ(heapArr, out_dists, out_inds, m, - numOfNN, starty); + loadPrevTopKsGmemWarpQ(heapArr, out_dists, out_inds, m, numOfNN, starty); } } if (gridStrideX > blockIdx.x * Policy::Nblk) { #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = - (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; - Pair tempKV = shDumpKV[(rowId * numOfNN) + numOfNN - 1]; + const auto rowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + Pair tempKV = shDumpKV[(rowId * numOfNN) + numOfNN - 1]; heapArr[i]->warpKTop = tempKV.value; } @@ -378,16 +400,14 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( int anyWarpTopKs = 0; #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = starty + i * Policy::AccThRows; + const auto rowId = starty + i * Policy::AccThRows; numValsWarpTopK[i] = 0; if (rowId < m) { #pragma unroll for (int j = 0; j < Policy::AccColsPerTh; ++j) { const auto colId = startx + j * Policy::AccThCols; if (colId < ldd) { - if (acc[i][j] < heapArr[i]->warpKTop) { - numValsWarpTopK[i]++; - } + if (acc[i][j] < heapArr[i]->warpKTop) { numValsWarpTopK[i]++; } } } anyWarpTopKs += numValsWarpTopK[i]; @@ -395,24 +415,21 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( } anyWarpTopKs = __syncthreads_or(anyWarpTopKs > 0); if (anyWarpTopKs) { - Pair *allWarpTopKs = (Pair *)(&smem[0]); + Pair* allWarpTopKs = (Pair*)(&smem[0]); uint32_t needScanSort[Policy::AccRowsPerTh]; #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { const auto gmemRowId = starty + i * Policy::AccThRows; - needScanSort[i] = 0; + needScanSort[i] = 0; if (gmemRowId < m) { - int myVals = numValsWarpTopK[i]; + int myVals = numValsWarpTopK[i]; needScanSort[i] = __ballot_sync(mask, myVals > 0); if (needScanSort[i]) { #pragma unroll for (unsigned int k = 1; k <= 16; k *= 2) { - const unsigned int n = - __shfl_up_sync(mask, numValsWarpTopK[i], k); - if (lid >= k) { - numValsWarpTopK[i] += n; - } + const unsigned int n = __shfl_up_sync(mask, numValsWarpTopK[i], k); + if (lid >= k) { numValsWarpTopK[i] += n; } } } // As each thread will know its total vals to write. @@ -421,8 +438,7 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( } if (needScanSort[i]) { - const auto rowId = - (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto rowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; if (gmemRowId < m) { if (needScanSort[i] & ((uint32_t)1 << lid)) { #pragma unroll @@ -430,17 +446,15 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( const auto colId = startx + j * Policy::AccThCols; if (colId < ldd) { if (acc[i][j] < heapArr[i]->warpKTop) { - Pair otherKV = {colId, acc[i][j]}; - allWarpTopKs[rowId * (256) + numValsWarpTopK[i]] = - otherKV; + Pair otherKV = {colId, acc[i][j]}; + allWarpTopKs[rowId * (256) + numValsWarpTopK[i]] = otherKV; numValsWarpTopK[i]++; } } } } const int finalNumVals = raft::shfl(numValsWarpTopK[i], 31); - loadWarpQShmem(heapArr[i], &shDumpKV[0], rowId, - numOfNN); + loadWarpQShmem(heapArr[i], &shDumpKV[0], rowId, numOfNN); updateSortedWarpQkNumWarpQRegisters>( heapArr[i], &allWarpTopKs[0], rowId, finalNumVals); } @@ -450,12 +464,10 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { if (needScanSort[i]) { - const auto rowId = - (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto rowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; const auto gmemRowId = starty + i * Policy::AccThRows; if (gmemRowId < m) { - storeWarpQShmem(heapArr[i], shDumpKV, rowId, - numOfNN); + storeWarpQShmem(heapArr[i], shDumpKV, rowId, numOfNN); } } } @@ -463,28 +475,24 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( } else { #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto gmemRowId = starty + i * Policy::AccThRows; - const auto shMemRowId = - (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto gmemRowId = starty + i * Policy::AccThRows; + const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; if (gmemRowId < m) { #pragma unroll for (int j = 0; j < Policy::AccColsPerTh; ++j) { const auto colId = startx + j * Policy::AccThCols; - Pair otherKV = {keyMax, identity}; + Pair otherKV = {keyMax, identity}; if (colId < ldd) { otherKV.value = acc[i][j]; - otherKV.key = colId; + otherKV.key = colId; } heapArr[i]->add(otherKV.value, otherKV.key); } bool needSort = (heapArr[i]->numVals > 0); - needSort = __any_sync(mask, needSort); - if (needSort) { - heapArr[i]->reduce(); - } - storeWarpQShmem(heapArr[i], shDumpKV, shMemRowId, - numOfNN); + needSort = __any_sync(mask, needSort); + if (needSort) { heapArr[i]->reduce(); } + storeWarpQShmem(heapArr[i], shDumpKV, shMemRowId, numOfNN); } } } @@ -492,27 +500,64 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( if (((gridStrideX + Policy::Nblk * gridDim.x) > n) && gridDim.x == 1) { // This is last iteration of grid stride X loadAllWarpQShmem(heapArr, &shDumpKV[0], m, numOfNN); - storeWarpQGmem(heapArr, out_dists, out_inds, m, numOfNN, - starty); + storeWarpQGmem(heapArr, out_dists, out_inds, m, numOfNN, starty); } }; - raft::distance::detail::PairwiseDistances< - useNorms, DataT, AccT, OutT, IdxT, Policy, CoreLambda, - decltype(epilog_lambda), FinalLambda, decltype(rowEpilog_lambda), - isRowMajor, false> - obj(x, y, m, n, k, lda, ldb, ldd, _xn, _yn, nullptr, smem, core_op, - epilog_lambda, fin_op, rowEpilog_lambda); + raft::distance::detail::PairwiseDistances + obj(x, + y, + m, + n, + k, + lda, + ldb, + ldd, + _xn, + _yn, + nullptr, + smem, + core_op, + epilog_lambda, + fin_op, + rowEpilog_lambda); obj.run(); } -template -void fusedL2UnexpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, bool sqrt, - OutT *out_dists, IdxT *out_inds, IdxT numOfNN, - cudaStream_t stream, void *workspace, - size_t &worksize) { +template +void fusedL2UnexpKnnImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + bool sqrt, + OutT* out_dists, + IdxT* out_inds, + IdxT numOfNN, + cudaStream_t stream, + void* workspace, + size_t& worksize) +{ typedef typename raft::linalg::Policy2x8::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; @@ -532,12 +577,30 @@ void fusedL2UnexpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, typedef cub::KeyValuePair Pair; if (isRowMajor) { - constexpr auto fusedL2UnexpKnn32RowMajor = - fusedL2kNN; - constexpr auto fusedL2UnexpKnn64RowMajor = - fusedL2kNN; + constexpr auto fusedL2UnexpKnn32RowMajor = fusedL2kNN; + constexpr auto fusedL2UnexpKnn64RowMajor = fusedL2kNN; auto fusedL2UnexpKnnRowMajor = fusedL2UnexpKnn32RowMajor; if (numOfNN <= 32) { @@ -545,13 +608,11 @@ void fusedL2UnexpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, } else if (numOfNN <= 64) { fusedL2UnexpKnnRowMajor = fusedL2UnexpKnn64RowMajor; } else { - ASSERT(numOfNN <= 64, - "fusedL2kNN: num of nearest neighbors must be <= 64"); + ASSERT(numOfNN <= 64, "fusedL2kNN: num of nearest neighbors must be <= 64"); } - const auto sharedMemSize = - KPolicy::SmemSize + (KPolicy::Mblk * numOfNN * sizeof(Pair)); - dim3 grid = raft::distance::detail::launchConfigGenerator( + const auto sharedMemSize = KPolicy::SmemSize + (KPolicy::Mblk * numOfNN * sizeof(Pair)); + dim3 grid = raft::distance::detail::launchConfigGenerator( m, n, sharedMemSize, fusedL2UnexpKnnRowMajor); if (grid.x > 1) { @@ -560,51 +621,133 @@ void fusedL2UnexpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, worksize = sizeof(int32_t) * numMutexes; return; } else { - CUDA_CHECK( - cudaMemsetAsync(workspace, 0, sizeof(int32_t) * numMutexes, stream)); + CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int32_t) * numMutexes, stream)); } } - fusedL2UnexpKnnRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, core_lambda, fin_op, sqrt, - (uint32_t)numOfNN, (int *)workspace, out_dists, out_inds); + fusedL2UnexpKnnRowMajor<<>>(x, + y, + nullptr, + nullptr, + m, + n, + k, + lda, + ldb, + ldd, + core_lambda, + fin_op, + sqrt, + (uint32_t)numOfNN, + (int*)workspace, + out_dists, + out_inds); } else { } CUDA_CHECK(cudaGetLastError()); } -template -void fusedL2UnexpKnn(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, bool sqrt, OutT *out_dists, - IdxT *out_inds, IdxT numOfNN, cudaStream_t stream, - void *workspace, size_t &worksize) { +template +void fusedL2UnexpKnn(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + bool sqrt, + OutT* out_dists, + IdxT* out_inds, + IdxT numOfNN, + cudaStream_t stream, + void* workspace, + size_t& worksize) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - fusedL2UnexpKnnImpl( - x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, out_inds, numOfNN, stream, - workspace, worksize); + fusedL2UnexpKnnImpl( + x, + y, + m, + n, + k, + lda, + ldb, + ldd, + sqrt, + out_dists, + out_inds, + numOfNN, + stream, + workspace, + worksize); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - fusedL2UnexpKnnImpl( - x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, out_inds, numOfNN, stream, - workspace, worksize); + fusedL2UnexpKnnImpl( + x, + y, + m, + n, + k, + lda, + ldb, + ldd, + sqrt, + out_dists, + out_inds, + numOfNN, + stream, + workspace, + worksize); } else { - fusedL2UnexpKnnImpl( - x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, out_inds, numOfNN, stream, - workspace, worksize); + fusedL2UnexpKnnImpl(x, + y, + m, + n, + k, + lda, + ldb, + ldd, + sqrt, + out_dists, + out_inds, + numOfNN, + stream, + workspace, + worksize); } } -template -void fusedL2ExpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, - IdxT lda, IdxT ldb, IdxT ldd, bool sqrt, OutT *out_dists, - IdxT *out_inds, IdxT numOfNN, cudaStream_t stream, - void *workspace, size_t &worksize) { +template +void fusedL2ExpKnnImpl(const DataT* x, + const DataT* y, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + bool sqrt, + OutT* out_dists, + IdxT* out_inds, + IdxT numOfNN, + cudaStream_t stream, + void* workspace, + size_t& worksize) +{ typedef typename raft::linalg::Policy2x8::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; @@ -612,28 +755,43 @@ void fusedL2ExpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, ASSERT(isRowMajor, "Only Row major inputs are allowed"); - ASSERT(!(((x != y) && (worksize < (m + n) * sizeof(AccT))) || - (worksize < m * sizeof(AccT))), + ASSERT(!(((x != y) && (worksize < (m + n) * sizeof(AccT))) || (worksize < m * sizeof(AccT))), "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); dim3 blk(KPolicy::Nthreads); // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - acc += x * y; - }; + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; auto fin_op = [] __device__(AccT d_val, int g_d_idx) { return d_val; }; typedef cub::KeyValuePair Pair; if (isRowMajor) { - constexpr auto fusedL2ExpKnn32RowMajor = - fusedL2kNN; - constexpr auto fusedL2ExpKnn64RowMajor = - fusedL2kNN; + constexpr auto fusedL2ExpKnn32RowMajor = fusedL2kNN; + constexpr auto fusedL2ExpKnn64RowMajor = fusedL2kNN; auto fusedL2ExpKnnRowMajor = fusedL2ExpKnn32RowMajor; if (numOfNN <= 32) { @@ -641,77 +799,137 @@ void fusedL2ExpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, } else if (numOfNN <= 64) { fusedL2ExpKnnRowMajor = fusedL2ExpKnn64RowMajor; } else { - ASSERT(numOfNN <= 64, - "fusedL2kNN: num of nearest neighbors must be <= 64"); + ASSERT(numOfNN <= 64, "fusedL2kNN: num of nearest neighbors must be <= 64"); } - const auto sharedMemSize = - KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)) + - (KPolicy::Mblk * numOfNN * sizeof(Pair)); + const auto sharedMemSize = KPolicy::SmemSize + + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)) + + (KPolicy::Mblk * numOfNN * sizeof(Pair)); dim3 grid = raft::distance::detail::launchConfigGenerator( m, n, sharedMemSize, fusedL2ExpKnnRowMajor); - int32_t *mutexes = nullptr; + int32_t* mutexes = nullptr; if (grid.x > 1) { - const auto numMutexes = raft::ceildiv(m, KPolicy::Mblk); - const auto normsSize = - (x != y) ? (m + n) * sizeof(DataT) : n * sizeof(DataT); + const auto numMutexes = raft::ceildiv(m, KPolicy::Mblk); + const auto normsSize = (x != y) ? (m + n) * sizeof(DataT) : n * sizeof(DataT); const auto requiredSize = sizeof(int32_t) * numMutexes + normsSize; if (worksize < requiredSize) { worksize = requiredSize; return; } else { - mutexes = (int32_t *)((char *)workspace + normsSize); - CUDA_CHECK( - cudaMemsetAsync(mutexes, 0, sizeof(int32_t) * numMutexes, stream)); + mutexes = (int32_t*)((char*)workspace + normsSize); + CUDA_CHECK(cudaMemsetAsync(mutexes, 0, sizeof(int32_t) * numMutexes, stream)); } } - DataT *xn = (DataT *)workspace; - DataT *yn = (DataT *)workspace; + DataT* xn = (DataT*)workspace; + DataT* yn = (DataT*)workspace; auto norm_op = [] __device__(DataT in) { return in; }; if (x != y) { yn += m; - raft::linalg::rowNorm(xn, x, k, m, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); - raft::linalg::rowNorm(yn, y, k, n, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(xn, x, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op); + raft::linalg::rowNorm(yn, y, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } else { - raft::linalg::rowNorm(xn, x, k, n, raft::linalg::L2Norm, isRowMajor, - stream, norm_op); + raft::linalg::rowNorm(xn, x, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op); } - fusedL2ExpKnnRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, core_lambda, fin_op, sqrt, - (uint32_t)numOfNN, mutexes, out_dists, out_inds); + fusedL2ExpKnnRowMajor<<>>(x, + y, + xn, + yn, + m, + n, + k, + lda, + ldb, + ldd, + core_lambda, + fin_op, + sqrt, + (uint32_t)numOfNN, + mutexes, + out_dists, + out_inds); } else { } CUDA_CHECK(cudaGetLastError()); } -template -void fusedL2ExpKnn(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, - const DataT *x, const DataT *y, bool sqrt, OutT *out_dists, - IdxT *out_inds, IdxT numOfNN, cudaStream_t stream, - void *workspace, size_t &worksize) { +template +void fusedL2ExpKnn(IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + const DataT* x, + const DataT* y, + bool sqrt, + OutT* out_dists, + IdxT* out_inds, + IdxT numOfNN, + cudaStream_t stream, + void* workspace, + size_t& worksize) +{ size_t bytesA = sizeof(DataT) * lda; size_t bytesB = sizeof(DataT) * ldb; if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - fusedL2ExpKnnImpl(x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, - out_inds, numOfNN, stream, workspace, - worksize); + fusedL2ExpKnnImpl( + x, + y, + m, + n, + k, + lda, + ldb, + ldd, + sqrt, + out_dists, + out_inds, + numOfNN, + stream, + workspace, + worksize); } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - fusedL2ExpKnnImpl(x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, - out_inds, numOfNN, stream, workspace, - worksize); + fusedL2ExpKnnImpl( + x, + y, + m, + n, + k, + lda, + ldb, + ldd, + sqrt, + out_dists, + out_inds, + numOfNN, + stream, + workspace, + worksize); } else { - fusedL2ExpKnnImpl( - x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, out_inds, numOfNN, stream, - workspace, worksize); + fusedL2ExpKnnImpl(x, + y, + m, + n, + k, + lda, + ldb, + ldd, + sqrt, + out_dists, + out_inds, + numOfNN, + stream, + workspace, + worksize); } } @@ -732,11 +950,19 @@ void fusedL2ExpKnn(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, * @param[in] stream stream to order kernel launch */ template -void fusedL2Knn(size_t D, value_idx *out_inds, value_t *out_dists, - const value_t *index, const value_t *query, size_t n_index_rows, - size_t n_query_rows, int k, bool rowMajorIndex, - bool rowMajorQuery, cudaStream_t stream, - raft::distance::DistanceType metric) { +void fusedL2Knn(size_t D, + value_idx* out_inds, + value_t* out_dists, + const value_t* index, + const value_t* query, + size_t n_index_rows, + size_t n_query_rows, + int k, + bool rowMajorIndex, + bool rowMajorQuery, + cudaStream_t stream, + raft::distance::DistanceType metric) +{ // Validate the input data ASSERT(k > 0, "l2Knn: k must be > 0"); ASSERT(D > 0, "l2Knn: D must be > 0"); @@ -750,8 +976,7 @@ void fusedL2Knn(size_t D, value_idx *out_inds, value_t *out_dists, ASSERT(rowMajorIndex == rowMajorQuery, "l2Knn: rowMajorIndex and rowMajorQuery should have same layout"); // TODO: Add support for column major layout - ASSERT(rowMajorIndex == true, - "l2Knn: only rowMajor inputs are supported for now."); + ASSERT(rowMajorIndex == true, "l2Knn: only rowMajor inputs are supported for now."); // Even for L2 Sqrt distance case we use non-sqrt version as FAISS bfKNN only support // non-sqrt metric & some tests in RAFT/cuML (like Linkage) fails if we use L2 sqrt. @@ -764,37 +989,82 @@ void fusedL2Knn(size_t D, value_idx *out_inds, value_t *out_dists, switch (metric) { case raft::distance::DistanceType::L2SqrtExpanded: case raft::distance::DistanceType::L2Expanded: - tempWorksize = raft::distance::detail::getWorkspaceSize< - raft::distance::DistanceType::L2Expanded, float, float, float, - value_idx>(query, index, n_query_rows, n_index_rows, D); + tempWorksize = raft::distance::detail:: + getWorkspaceSize( + query, index, n_query_rows, n_index_rows, D); worksize = tempWorksize; workspace.resize(worksize, stream); - fusedL2ExpKnn( - n_query_rows, n_index_rows, D, lda, ldb, ldd, query, index, sqrt, - out_dists, out_inds, k, stream, workspace.data(), worksize); + fusedL2ExpKnn(n_query_rows, + n_index_rows, + D, + lda, + ldb, + ldd, + query, + index, + sqrt, + out_dists, + out_inds, + k, + stream, + workspace.data(), + worksize); if (worksize > tempWorksize) { workspace.resize(worksize, stream); - fusedL2ExpKnn( - n_query_rows, n_index_rows, D, lda, ldb, ldd, query, index, sqrt, - out_dists, out_inds, k, stream, workspace.data(), worksize); + fusedL2ExpKnn(n_query_rows, + n_index_rows, + D, + lda, + ldb, + ldd, + query, + index, + sqrt, + out_dists, + out_inds, + k, + stream, + workspace.data(), + worksize); } break; case raft::distance::DistanceType::L2Unexpanded: case raft::distance::DistanceType::L2SqrtUnexpanded: - fusedL2UnexpKnn( - n_query_rows, n_index_rows, D, lda, ldb, ldd, query, index, sqrt, - out_dists, out_inds, k, stream, workspace.data(), worksize); + fusedL2UnexpKnn(n_query_rows, + n_index_rows, + D, + lda, + ldb, + ldd, + query, + index, + sqrt, + out_dists, + out_inds, + k, + stream, + workspace.data(), + worksize); if (worksize) { workspace.resize(worksize, stream); - fusedL2UnexpKnn(n_query_rows, n_index_rows, D, lda, ldb, ldd, - query, index, sqrt, out_dists, out_inds, k, - stream, workspace.data(), worksize); + fusedL2UnexpKnn(n_query_rows, + n_index_rows, + D, + lda, + ldb, + ldd, + query, + index, + sqrt, + out_dists, + out_inds, + k, + stream, + workspace.data(), + worksize); } break; - default: - printf("only L2 distance metric is supported\n"); - break; + default: printf("only L2 distance metric is supported\n"); break; }; } diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh index 7d87254cb6..049c11514c 100644 --- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh +++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh @@ -35,7 +35,8 @@ namespace knn { namespace detail { template -DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) { +DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) +{ value_t sin_0 = sin(0.5 * (x1 - y1)); value_t sin_1 = sin(0.5 * (x2 - y2)); value_t rdist = sin_0 * sin_0 + cos(x1) * cos(y1) * sin_1 * sin_1; @@ -56,34 +57,36 @@ DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) { * @param[in] n_index_rows number of rows in index array * @param[in] k number of closest neighbors to return */ -template -__global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists, - const value_t *index, const value_t *query, - size_t n_index_rows, int k) { +template +__global__ void haversine_knn_kernel(value_idx* out_inds, + value_t* out_dists, + const value_t* index, + const value_t* query, + size_t n_index_rows, + int k) +{ constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ value_t smemK[kNumWarps * warp_q]; __shared__ value_idx smemV[kNumWarps * warp_q]; - faiss::gpu::BlockSelect, warp_q, thread_q, - tpb> - heap(faiss::gpu::Limits::getMax(), -1, smemK, smemV, k); + faiss::gpu:: + BlockSelect, warp_q, thread_q, tpb> + heap(faiss::gpu::Limits::getMax(), -1, smemK, smemV, k); // Grid is exactly sized to rows available int limit = faiss::gpu::utils::roundDown(n_index_rows, faiss::gpu::kWarpSize); - const value_t *query_ptr = query + (blockIdx.x * 2); - value_t x1 = query_ptr[0]; - value_t x2 = query_ptr[1]; + const value_t* query_ptr = query + (blockIdx.x * 2); + value_t x1 = query_ptr[0]; + value_t x2 = query_ptr[1]; int i = threadIdx.x; for (; i < limit; i += tpb) { - const value_t *idx_ptr = index + (i * 2); - value_t y1 = idx_ptr[0]; - value_t y2 = idx_ptr[1]; + const value_t* idx_ptr = index + (i * 2); + value_t y1 = idx_ptr[0]; + value_t y2 = idx_ptr[1]; value_t dist = compute_haversine(x1, y1, x2, y2); @@ -92,9 +95,9 @@ __global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists, // Handle last remainder fraction of a warp of elements if (i < n_index_rows) { - const value_t *idx_ptr = index + (i * 2); - value_t y1 = idx_ptr[0]; - value_t y2 = idx_ptr[1]; + const value_t* idx_ptr = index + (i * 2); + value_t y1 = idx_ptr[0]; + value_t y2 = idx_ptr[1]; value_t dist = compute_haversine(x1, y1, x2, y2); @@ -105,7 +108,7 @@ __global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists, for (int i = threadIdx.x; i < k; i += tpb) { out_dists[blockIdx.x * k + i] = smemK[i]; - out_inds[blockIdx.x * k + i] = smemV[i]; + out_inds[blockIdx.x * k + i] = smemV[i]; } } @@ -126,10 +129,15 @@ __global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists, * @param[in] stream stream to order kernel launch */ template -void haversine_knn(value_idx *out_inds, value_t *out_dists, - const value_t *index, const value_t *query, - size_t n_index_rows, size_t n_query_rows, int k, - cudaStream_t stream) { +void haversine_knn(value_idx* out_inds, + value_t* out_dists, + const value_t* index, + const value_t* query, + size_t n_index_rows, + size_t n_query_rows, + int k, + cudaStream_t stream) +{ haversine_knn_kernel<<>>( out_inds, out_dists, index, query, n_index_rows, k); } diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index da1217e3cf..2866049188 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -46,13 +46,22 @@ namespace spatial { namespace knn { namespace detail { -template -__global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV, - value_t *outK, value_idx *outV, - size_t n_samples, int n_parts, - value_t initK, value_idx initV, int k, - value_idx *translations) { +template +__global__ void knn_merge_parts_kernel(value_t* inK, + value_idx* inV, + value_t* outK, + value_idx* outV, + size_t n_samples, + int n_parts, + value_t initK, + value_idx initV, + int k, + value_idx* translations) +{ constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ value_t smemK[kNumWarps * warp_q]; @@ -61,34 +70,33 @@ __global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV, /** * Uses shared memory */ - faiss::gpu::BlockSelect, warp_q, thread_q, - tpb> - heap(initK, initV, smemK, smemV, k); + faiss::gpu:: + BlockSelect, warp_q, thread_q, tpb> + heap(initK, initV, smemK, smemV, k); // Grid is exactly sized to rows available - int row = blockIdx.x; + int row = blockIdx.x; int total_k = k * n_parts; int i = threadIdx.x; // Get starting pointers for cols in current thread - int part = i / k; + int part = i / k; size_t row_idx = (row * k) + (part * n_samples * k); int col = i % k; - value_t *inKStart = inK + (row_idx + col); - value_idx *inVStart = inV + (row_idx + col); + value_t* inKStart = inK + (row_idx + col); + value_idx* inVStart = inV + (row_idx + col); - int limit = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize); + int limit = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize); value_idx translation = 0; for (; i < limit; i += tpb) { translation = translations[part]; heap.add(*inKStart, (*inVStart) + translation); - part = (i + tpb) / k; + part = (i + tpb) / k; row_idx = (row * k) + (part * n_samples * k); col = (i + tpb) % k; @@ -111,22 +119,27 @@ __global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV, } } -template -inline void knn_merge_parts_impl(value_t *inK, value_idx *inV, value_t *outK, - value_idx *outV, size_t n_samples, int n_parts, - int k, cudaStream_t stream, - value_idx *translations) { +template +inline void knn_merge_parts_impl(value_t* inK, + value_idx* inV, + value_t* outK, + value_idx* outV, + size_t n_samples, + int n_parts, + int k, + cudaStream_t stream, + value_idx* translations) +{ auto grid = dim3(n_samples); constexpr int n_threads = (warp_q <= 1024) ? 128 : 64; - auto block = dim3(n_threads); + auto block = dim3(n_threads); auto kInit = faiss::gpu::Limits::getMax(); auto vInit = -1; knn_merge_parts_kernel - <<>>(inK, inV, outK, outV, n_samples, n_parts, - kInit, vInit, k, translations); + <<>>( + inK, inV, outK, outV, n_samples, n_parts, kInit, vInit, k, translations); CUDA_CHECK(cudaPeekAtLastError()); } @@ -145,10 +158,16 @@ inline void knn_merge_parts_impl(value_t *inK, value_idx *inV, value_t *outK, * @param translations mapping of index offsets for each partition */ template -inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, - value_idx *outV, size_t n_samples, int n_parts, - int k, cudaStream_t stream, - value_idx *translations) { +inline void knn_merge_parts(value_t* inK, + value_idx* inV, + value_t* outK, + value_idx* outV, + size_t n_samples, + int n_parts, + int k, + cudaStream_t stream, + value_idx* translations) +{ if (k == 1) knn_merge_parts_impl( inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); @@ -197,26 +216,32 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, * @param[in] metricArg metric argument to use. Corresponds to the p arg for lp norm */ template -void brute_force_knn_impl(std::vector &input, - std::vector &sizes, IntType D, - float *search_items, IntType n, IdxType *res_I, - float *res_D, IntType k, cudaStream_t userStream, - cudaStream_t *internalStreams = nullptr, - int n_int_streams = 0, bool rowMajorIndex = true, - bool rowMajorQuery = true, - std::vector *translations = nullptr, - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2Expanded, - float metricArg = 0) { - ASSERT(input.size() == sizes.size(), - "input and sizes vectors should be the same size"); - - std::vector *id_ranges; +void brute_force_knn_impl( + std::vector& input, + std::vector& sizes, + IntType D, + float* search_items, + IntType n, + IdxType* res_I, + float* res_D, + IntType k, + cudaStream_t userStream, + cudaStream_t* internalStreams = nullptr, + int n_int_streams = 0, + bool rowMajorIndex = true, + bool rowMajorQuery = true, + std::vector* translations = nullptr, + raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded, + float metricArg = 0) +{ + ASSERT(input.size() == sizes.size(), "input and sizes vectors should be the same size"); + + std::vector* id_ranges; if (translations == nullptr) { // If we don't have explicit translations // for offsets of the indices, build them // from the local partitions - id_ranges = new std::vector(); + id_ranges = new std::vector(); IdxType total_n = 0; for (size_t i = 0; i < input.size(); i++) { id_ranges->push_back(total_n); @@ -232,11 +257,10 @@ void brute_force_knn_impl(std::vector &input, create_processor(metric, n, D, k, rowMajorQuery, userStream); query_metric_processor->preprocess(search_items); - std::vector>> metric_processors( - input.size()); + std::vector>> metric_processors(input.size()); for (size_t i = 0; i < input.size(); i++) { - metric_processors[i] = create_processor(metric, sizes[i], D, k, - rowMajorQuery, userStream); + metric_processors[i] = + create_processor(metric, sizes[i], D, k, rowMajorQuery, userStream); metric_processors[i]->preprocess(input[i]); } @@ -244,14 +268,13 @@ void brute_force_knn_impl(std::vector &input, CUDA_CHECK(cudaGetDevice(&device)); rmm::device_uvector trans(id_ranges->size(), userStream); - raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), - userStream); + raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream); rmm::device_uvector all_D(0, userStream); rmm::device_uvector all_I(0, userStream); - float *out_D = res_D; - IdxType *out_I = res_I; + float* out_D = res_D; + IdxType* out_I = res_I; if (input.size() > 1) { all_D.resize(input.size() * k * n, userStream); @@ -265,19 +288,28 @@ void brute_force_knn_impl(std::vector &input, if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream)); for (size_t i = 0; i < input.size(); i++) { - float *out_d_ptr = out_D + (i * k * n); - IdxType *out_i_ptr = out_I + (i * k * n); + float* out_d_ptr = out_D + (i * k * n); + IdxType* out_i_ptr = out_I + (i * k * n); - cudaStream_t stream = - raft::select_stream(userStream, internalStreams, n_int_streams, i); + cudaStream_t stream = raft::select_stream(userStream, internalStreams, n_int_streams, i); if (k <= 64 && rowMajorQuery == rowMajorIndex && rowMajorQuery == true && (metric == raft::distance::DistanceType::L2Unexpanded || metric == raft::distance::DistanceType::L2SqrtUnexpanded || metric == raft::distance::DistanceType::L2Expanded || metric == raft::distance::DistanceType::L2SqrtExpanded)) { - fusedL2Knn(D, out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, - k, rowMajorIndex, rowMajorQuery, stream, metric); + fusedL2Knn(D, + out_i_ptr, + out_d_ptr, + input[i], + search_items, + sizes[i], + n, + k, + rowMajorIndex, + rowMajorQuery, + stream, + metric); } else { switch (metric) { case raft::distance::DistanceType::Haversine: @@ -286,8 +318,7 @@ void brute_force_knn_impl(std::vector &input, "Haversine distance requires 2 dimensions " "(latitude / longitude)."); - haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], - n, k, stream); + haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, k, stream); break; default: faiss::MetricType m = build_faiss_metric(metric); @@ -298,18 +329,18 @@ void brute_force_knn_impl(std::vector &input, gpu_res.setDefaultStream(device, stream); faiss::gpu::GpuDistanceParams args; - args.metric = m; - args.metricArg = metricArg; - args.k = k; - args.dims = D; - args.vectors = input[i]; + args.metric = m; + args.metricArg = metricArg; + args.k = k; + args.dims = D; + args.vectors = input[i]; args.vectorsRowMajor = rowMajorIndex; - args.numVectors = sizes[i]; - args.queries = search_items; + args.numVectors = sizes[i]; + args.queries = search_items; args.queriesRowMajor = rowMajorQuery; - args.numQueries = n; - args.outDistances = out_d_ptr; - args.outIndices = out_i_ptr; + args.numQueries = n; + args.outDistances = out_d_ptr; + args.outIndices = out_i_ptr; /** * @todo: Until FAISS supports pluggable allocation strategies, @@ -333,8 +364,7 @@ void brute_force_knn_impl(std::vector &input, if (input.size() > 1 || translations != nullptr) { // This is necessary for proper index translations. If there are // no translations or partitions to combine, it can be skipped. - knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, - trans.data()); + knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, trans.data()); } // Perform necessary post-processing @@ -342,14 +372,12 @@ void brute_force_knn_impl(std::vector &input, metric == raft::distance::DistanceType::L2SqrtUnexpanded || metric == raft::distance::DistanceType::LpUnexpanded) { /** - * post-processing - */ + * post-processing + */ float p = 0.5; // standard l2 - if (metric == raft::distance::DistanceType::LpUnexpanded) - p = 1.0 / metricArg; + if (metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / metricArg; raft::linalg::unaryOp( - res_D, res_D, n * k, - [p] __device__(float input) { return powf(input, p); }, userStream); + res_D, res_D, n * k, [p] __device__(float input) { return powf(input, p); }, userStream); } query_metric_processor->revert(search_items); diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index b66ea025a2..f87fffc6cf 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -37,11 +37,11 @@ namespace knn { template class MetricProcessor { public: - virtual void preprocess(math_t *data) {} + virtual void preprocess(math_t* data) {} - virtual void revert(math_t *data) {} + virtual void revert(math_t* data) {} - virtual void postprocess(math_t *data) {} + virtual void postprocess(math_t* data) {} virtual ~MetricProcessor() = default; }; @@ -57,37 +57,57 @@ class CosineMetricProcessor : public MetricProcessor { rmm::device_uvector colsums_; public: - CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, - cudaStream_t stream) + CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, cudaStream_t stream) : stream_(stream), colsums_(n_rows, stream), n_cols_(n_cols), n_rows_(n_rows), row_major_(row_major), - k_(k) {} + k_(k) + { + } - void preprocess(math_t *data) { - raft::linalg::rowNorm(colsums_.data(), data, n_cols_, n_rows_, - raft::linalg::NormType::L2Norm, row_major_, stream_, + void preprocess(math_t* data) + { + raft::linalg::rowNorm(colsums_.data(), + data, + n_cols_, + n_rows_, + raft::linalg::NormType::L2Norm, + row_major_, + stream_, [] __device__(math_t in) { return sqrtf(in); }); raft::linalg::matrixVectorOp( - data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false, + data, + data, + colsums_.data(), + n_cols_, + n_rows_, + row_major_, + false, [] __device__(math_t mat_in, math_t vec_in) { return mat_in / vec_in; }, stream_); } - void revert(math_t *data) { + void revert(math_t* data) + { raft::linalg::matrixVectorOp( - data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false, + data, + data, + colsums_.data(), + n_cols_, + n_rows_, + row_major_, + false, [] __device__(math_t mat_in, math_t vec_in) { return mat_in * vec_in; }, stream_); } - void postprocess(math_t *data) { + void postprocess(math_t* data) + { raft::linalg::unaryOp( - data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, - stream_); + data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, stream_); } ~CosineMetricProcessor() = default; @@ -98,41 +118,59 @@ class CorrelationMetricProcessor : public CosineMetricProcessor { using cosine = CosineMetricProcessor; public: - CorrelationMetricProcessor(size_t n_rows, size_t n_cols, int k, - bool row_major, cudaStream_t stream) - : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream), - means_(n_rows, stream) {} + CorrelationMetricProcessor( + size_t n_rows, size_t n_cols, int k, bool row_major, cudaStream_t stream) + : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream), means_(n_rows, stream) + { + } - void preprocess(math_t *data) { + void preprocess(math_t* data) + { math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_; - raft::linalg::reduce(means_.data(), data, cosine::n_cols_, cosine::n_rows_, - (math_t)0.0, cosine::row_major_, true, + raft::linalg::reduce(means_.data(), + data, + cosine::n_cols_, + cosine::n_rows_, + (math_t)0.0, + cosine::row_major_, + true, cosine::stream_); raft::linalg::unaryOp( - means_.data(), means_.data(), cosine::n_rows_, + means_.data(), + means_.data(), + cosine::n_rows_, [=] __device__(math_t in) { return in * normalizer_const; }, cosine::stream_); - raft::stats::meanCenter(data, data, means_.data(), cosine::n_cols_, - cosine::n_rows_, cosine::row_major_, false, + raft::stats::meanCenter(data, + data, + means_.data(), + cosine::n_cols_, + cosine::n_rows_, + cosine::row_major_, + false, cosine::stream_); CosineMetricProcessor::preprocess(data); } - void revert(math_t *data) { + void revert(math_t* data) + { CosineMetricProcessor::revert(data); - raft::stats::meanAdd(data, data, means_.data(), cosine::n_cols_, - cosine::n_rows_, cosine::row_major_, false, + raft::stats::meanAdd(data, + data, + means_.data(), + cosine::n_cols_, + cosine::n_rows_, + cosine::row_major_, + false, cosine::stream_); } - void postprocess(math_t *data) { - CosineMetricProcessor::postprocess(data); - } + void postprocess(math_t* data) { CosineMetricProcessor::postprocess(data); } ~CorrelationMetricProcessor() = default; @@ -142,33 +180,30 @@ class CorrelationMetricProcessor : public CosineMetricProcessor { template class DefaultMetricProcessor : public MetricProcessor { public: - void preprocess(math_t *data) {} + void preprocess(math_t* data) {} - void revert(math_t *data) {} + void revert(math_t* data) {} - void postprocess(math_t *data) {} + void postprocess(math_t* data) {} ~DefaultMetricProcessor() = default; }; template inline std::unique_ptr> create_processor( - distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery, - cudaStream_t userStream) { - MetricProcessor *mp = nullptr; + distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery, cudaStream_t userStream) +{ + MetricProcessor* mp = nullptr; switch (metric) { case distance::DistanceType::CosineExpanded: - mp = - new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream); + mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream); break; case distance::DistanceType::CorrelationExpanded: - mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, - userStream); + mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, userStream); break; - default: - mp = new DefaultMetricProcessor(); + default: mp = new DefaultMetricProcessor(); } return std::unique_ptr>(mp); diff --git a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh index 045edad0e6..88fa58a4d7 100644 --- a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh @@ -31,27 +31,33 @@ namespace spatial { namespace knn { namespace detail { -template -__global__ void select_k_kernel(K *inK, IndexType *inV, size_t n_rows, - size_t n_cols, K *outK, IndexType *outV, - K initK, IndexType initV, int k) { +template +__global__ void select_k_kernel(K* inK, + IndexType* inV, + size_t n_rows, + size_t n_cols, + K* outK, + IndexType* outV, + K initK, + IndexType initV, + int k) +{ constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; __shared__ K smemK[kNumWarps * warp_q]; __shared__ IndexType smemV[kNumWarps * warp_q]; - faiss::gpu::BlockSelect, - warp_q, thread_q, tpb> - heap(initK, initV, smemK, smemV, k); + faiss::gpu:: + BlockSelect, warp_q, thread_q, tpb> + heap(initK, initV, smemK, smemV, k); // Grid is exactly sized to rows available int row = blockIdx.x; - int i = threadIdx.x; + int i = threadIdx.x; - int idx = row * n_cols; - K *inKStart = inK + idx + i; - IndexType *inVStart = inV + idx + i; + int idx = row * n_cols; + K* inKStart = inK + idx + i; + IndexType* inVStart = inV + idx + i; // Whole warps must participate in the selection int limit = faiss::gpu::utils::roundDown(n_cols, faiss::gpu::kWarpSize); @@ -78,27 +84,31 @@ __global__ void select_k_kernel(K *inK, IndexType *inV, size_t n_rows, } } -template -inline void select_k_impl(value_t *inK, value_idx *inV, size_t n_rows, - size_t n_cols, value_t *outK, value_idx *outV, - bool select_min, int k, cudaStream_t stream) { +template +inline void select_k_impl(value_t* inK, + value_idx* inV, + size_t n_rows, + size_t n_cols, + value_t* outK, + value_idx* outV, + bool select_min, + int k, + cudaStream_t stream) +{ auto grid = dim3(n_rows); constexpr int n_threads = (warp_q <= 1024) ? 128 : 64; - auto block = dim3(n_threads); + auto block = dim3(n_threads); - auto kInit = select_min ? faiss::gpu::Limits::getMax() - : faiss::gpu::Limits::getMin(); + auto kInit = + select_min ? faiss::gpu::Limits::getMax() : faiss::gpu::Limits::getMin(); auto vInit = -1; if (select_min) { select_k_kernel - <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, - vInit, k); + <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k); } else { select_k_kernel - <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, - vInit, k); + <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k); } CUDA_CHECK(cudaGetLastError()); } @@ -118,30 +128,37 @@ inline void select_k_impl(value_t *inK, value_idx *inV, size_t n_rows, * @param[in] stream CUDA stream to use */ template -inline void select_k(value_t *inK, value_idx *inV, size_t n_rows, size_t n_cols, - value_t *outK, value_idx *outV, bool select_min, int k, - cudaStream_t stream) { +inline void select_k(value_t* inK, + value_idx* inV, + size_t n_rows, + size_t n_cols, + value_t* outK, + value_idx* outV, + bool select_min, + int k, + cudaStream_t stream) +{ if (k == 1) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 32) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 64) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 128) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 256) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 512) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 1024) - select_k_impl(inK, inV, n_rows, n_cols, outK, - outV, select_min, k, stream); + select_k_impl( + inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); } }; // namespace detail diff --git a/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh b/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh index 84719a0e4b..abc4cdf545 100644 --- a/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh @@ -30,21 +30,25 @@ struct KeyValuePair { __host__ __device__ __forceinline__ KeyValuePair() {} /// Copy Constructors - __host__ __device__ __forceinline__ - KeyValuePair(cub::KeyValuePair<_Key, _Value>& kvp) - : key(kvp.key), value(kvp.value) {} + __host__ __device__ __forceinline__ KeyValuePair(cub::KeyValuePair<_Key, _Value>& kvp) + : key(kvp.key), value(kvp.value) + { + } - __host__ __device__ __forceinline__ - KeyValuePair(faiss::gpu::KeyValuePair<_Key, _Value>& kvp) - : key(kvp.key), value(kvp.value) {} + __host__ __device__ __forceinline__ KeyValuePair(faiss::gpu::KeyValuePair<_Key, _Value>& kvp) + : key(kvp.key), value(kvp.value) + { + } /// Constructor - __host__ __device__ __forceinline__ KeyValuePair(Key const& key, - Value const& value) - : key(key), value(value) {} + __host__ __device__ __forceinline__ KeyValuePair(Key const& key, Value const& value) + : key(key), value(value) + { + } /// Inequality operator - __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair& b) { + __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair& b) + { return (value != b.value) || (key != b.key); } }; @@ -117,9 +121,9 @@ struct KeyValuePair { // // If IsBitonic is false, the first stage is reversed, so we don't // need to sort directionally. It's still technically a bitonic sort. -template -inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair& v) { +template +inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair& v) +{ static_assert(utils::isPowerOf2(L), "L must be a power-of-2"); static_assert(L <= kWarpSize / 2, "merge list size must be <= 16"); @@ -129,7 +133,7 @@ inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair& v) { // Reverse the first comparison stage. // For example, merging a list of size 8 has the exchanges: // 0 <-> 15, 1 <-> 14, ... - K otherK = shfl_xor(k, 2 * L - 1); + K otherK = shfl_xor(k, 2 * L - 1); K otherVk = shfl_xor(v.key, 2 * L - 1); V otherVv = shfl_xor(v.value, 2 * L - 1); @@ -157,7 +161,7 @@ inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair& v) { #pragma unroll for (int stride = IsBitonic ? L : L / 2; stride > 0; stride /= 2) { - K otherK = shfl_xor(k, stride); + K otherK = shfl_xor(k, stride); K otherVk = shfl_xor(v.key, stride); V otherVv = shfl_xor(v.value, stride); @@ -183,9 +187,9 @@ inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair& v) { // Template for performing a bitonic merge of an arbitrary set of // registers -template -struct BitonicMergeStepKVP {}; +template +struct BitonicMergeStepKVP { +}; // // Power-of-2 merge specialization @@ -194,7 +198,8 @@ struct BitonicMergeStepKVP {}; // All merges eventually call this template struct BitonicMergeStepKVP { - static inline __device__ void merge(K k[1], KeyValuePair v[1]) { + static inline __device__ void merge(K k[1], KeyValuePair v[1]) + { // Use warp shuffles warpBitonicMergeLE16KVP(k[0], v[0]); } @@ -202,16 +207,17 @@ struct BitonicMergeStepKVP { template struct BitonicMergeStepKVP { - static inline __device__ void merge(K k[N], KeyValuePair v[N]) { + static inline __device__ void merge(K k[N], KeyValuePair v[N]) + { static_assert(utils::isPowerOf2(N), "must be power of 2"); static_assert(N > 1, "must be N > 1"); #pragma unroll for (int i = 0; i < N / 2; ++i) { - K& ka = k[i]; + K& ka = k[i]; KeyValuePair& va = v[i]; - K& kb = k[i + N / 2]; + K& kb = k[i + N / 2]; KeyValuePair& vb = v[i + N / 2]; bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); @@ -226,18 +232,17 @@ struct BitonicMergeStepKVP { #pragma unroll for (int i = 0; i < N / 2; ++i) { - newK[i] = k[i]; - newV[i].key = v[i].key; + newK[i] = k[i]; + newV[i].key = v[i].key; newV[i].value = v[i].value; } - BitonicMergeStepKVP::merge(newK, - newV); + BitonicMergeStepKVP::merge(newK, newV); #pragma unroll for (int i = 0; i < N / 2; ++i) { - k[i] = newK[i]; - v[i].key = newV[i].key; + k[i] = newK[i]; + v[i].key = newV[i].key; v[i].value = newV[i].value; } } @@ -248,18 +253,17 @@ struct BitonicMergeStepKVP { #pragma unroll for (int i = 0; i < N / 2; ++i) { - newK[i] = k[i + N / 2]; - newV[i].key = v[i + N / 2].key; + newK[i] = k[i + N / 2]; + newV[i].key = v[i + N / 2].key; newV[i].value = v[i + N / 2].value; } - BitonicMergeStepKVP::merge(newK, - newV); + BitonicMergeStepKVP::merge(newK, newV); #pragma unroll for (int i = 0; i < N / 2; ++i) { - k[i + N / 2] = newK[i]; - v[i + N / 2].key = newV[i].key; + k[i + N / 2] = newK[i]; + v[i + N / 2].key = newV[i].key; v[i + N / 2].value = newV[i].value; } } @@ -273,7 +277,8 @@ struct BitonicMergeStepKVP { // Low recursion template struct BitonicMergeStepKVP { - static inline __device__ void merge(K k[N], KeyValuePair v[N]) { + static inline __device__ void merge(K k[N], KeyValuePair v[N]) + { static_assert(!utils::isPowerOf2(N), "must be non-power-of-2"); static_assert(N >= 3, "must be N >= 3"); @@ -281,10 +286,10 @@ struct BitonicMergeStepKVP { #pragma unroll for (int i = 0; i < N - kNextHighestPowerOf2 / 2; ++i) { - K& ka = k[i]; + K& ka = k[i]; KeyValuePair& va = v[i]; - K& kb = k[i + kNextHighestPowerOf2 / 2]; + K& kb = k[i + kNextHighestPowerOf2 / 2]; KeyValuePair& vb = v[i + kNextHighestPowerOf2 / 2]; bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); @@ -293,7 +298,7 @@ struct BitonicMergeStepKVP { swap(s, va.value, vb.value); } - constexpr int kLowSize = N - kNextHighestPowerOf2 / 2; + constexpr int kLowSize = N - kNextHighestPowerOf2 / 2; constexpr int kHighSize = kNextHighestPowerOf2 / 2; { K newK[kLowSize]; @@ -301,23 +306,26 @@ struct BitonicMergeStepKVP { #pragma unroll for (int i = 0; i < kLowSize; ++i) { - newK[i] = k[i]; - newV[i].key = v[i].key; + newK[i] = k[i]; + newV[i].key = v[i].key; newV[i].value = v[i].value; } - constexpr bool kLowIsPowerOf2 = - utils::isPowerOf2(N - kNextHighestPowerOf2 / 2); + constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(N - kNextHighestPowerOf2 / 2); // FIXME: compiler doesn't like this expression? compiler bug? // constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kLowSize); - BitonicMergeStepKVP::merge(newK, newV); #pragma unroll for (int i = 0; i < kLowSize; ++i) { - k[i] = newK[i]; - v[i].key = newV[i].key; + k[i] = newK[i]; + v[i].key = newV[i].key; v[i].value = newV[i].value; } } @@ -328,23 +336,26 @@ struct BitonicMergeStepKVP { #pragma unroll for (int i = 0; i < kHighSize; ++i) { - newK[i] = k[i + kLowSize]; - newV[i].key = v[i + kLowSize].key; + newK[i] = k[i + kLowSize]; + newV[i].key = v[i + kLowSize].key; newV[i].value = v[i + kLowSize].value; } - constexpr bool kHighIsPowerOf2 = - utils::isPowerOf2(kNextHighestPowerOf2 / 2); + constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kNextHighestPowerOf2 / 2); // FIXME: compiler doesn't like this expression? compiler bug? // constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kHighSize); - BitonicMergeStepKVP::merge(newK, newV); #pragma unroll for (int i = 0; i < kHighSize; ++i) { - k[i + kLowSize] = newK[i]; - v[i + kLowSize].key = newV[i].key; + k[i + kLowSize] = newK[i]; + v[i + kLowSize].key = newV[i].key; v[i + kLowSize].value = newV[i].value; } } @@ -354,7 +365,8 @@ struct BitonicMergeStepKVP { // High recursion template struct BitonicMergeStepKVP { - static inline __device__ void merge(K k[N], KeyValuePair v[N]) { + static inline __device__ void merge(K k[N], KeyValuePair v[N]) + { static_assert(!utils::isPowerOf2(N), "must be non-power-of-2"); static_assert(N >= 3, "must be N >= 3"); @@ -362,10 +374,10 @@ struct BitonicMergeStepKVP { #pragma unroll for (int i = 0; i < N - kNextHighestPowerOf2 / 2; ++i) { - K& ka = k[i]; + K& ka = k[i]; KeyValuePair& va = v[i]; - K& kb = k[i + kNextHighestPowerOf2 / 2]; + K& kb = k[i + kNextHighestPowerOf2 / 2]; KeyValuePair& vb = v[i + kNextHighestPowerOf2 / 2]; bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); @@ -374,7 +386,7 @@ struct BitonicMergeStepKVP { swap(s, va.value, vb.value); } - constexpr int kLowSize = kNextHighestPowerOf2 / 2; + constexpr int kLowSize = kNextHighestPowerOf2 / 2; constexpr int kHighSize = N - kNextHighestPowerOf2 / 2; { K newK[kLowSize]; @@ -382,23 +394,26 @@ struct BitonicMergeStepKVP { #pragma unroll for (int i = 0; i < kLowSize; ++i) { - newK[i] = k[i]; - newV[i].key = v[i].key; + newK[i] = k[i]; + newV[i].key = v[i].key; newV[i].value = v[i].value; } - constexpr bool kLowIsPowerOf2 = - utils::isPowerOf2(kNextHighestPowerOf2 / 2); + constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kNextHighestPowerOf2 / 2); // FIXME: compiler doesn't like this expression? compiler bug? // constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kLowSize); - BitonicMergeStepKVP::merge(newK, newV); #pragma unroll for (int i = 0; i < kLowSize; ++i) { - k[i] = newK[i]; - v[i].key = newV[i].key; + k[i] = newK[i]; + v[i].key = newV[i].key; v[i].value = newV[i].value; } } @@ -409,23 +424,26 @@ struct BitonicMergeStepKVP { #pragma unroll for (int i = 0; i < kHighSize; ++i) { - newK[i] = k[i + kLowSize]; - newV[i].key = v[i + kLowSize].key; + newK[i] = k[i + kLowSize]; + newV[i].key = v[i + kLowSize].key; newV[i].value = v[i + kLowSize].value; } - constexpr bool kHighIsPowerOf2 = - utils::isPowerOf2(N - kNextHighestPowerOf2 / 2); + constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(N - kNextHighestPowerOf2 / 2); // FIXME: compiler doesn't like this expression? compiler bug? // constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kHighSize); - BitonicMergeStepKVP::merge(newK, newV); #pragma unroll for (int i = 0; i < kHighSize; ++i) { - k[i + kLowSize] = newK[i]; - v[i + kLowSize].key = newV[i].key; + k[i + kLowSize] = newK[i]; + v[i + kLowSize].key = newV[i].key; v[i + kLowSize].value = newV[i].value; } } @@ -436,20 +454,20 @@ struct BitonicMergeStepKVP { /// i.e., merges a sorted k/v list of size kWarpSize * N1 with a /// sorted k/v list of size kWarpSize * N2, where N1 and N2 are any /// value >= 1 -template +template inline __device__ void warpMergeAnyRegistersKVP(K k1[N1], KeyValuePair v1[N1], K k2[N2], - KeyValuePair v2[N2]) { + KeyValuePair v2[N2]) +{ constexpr int kSmallestN = N1 < N2 ? N1 : N2; #pragma unroll for (int i = 0; i < kSmallestN; ++i) { - K& ka = k1[N1 - 1 - i]; + K& ka = k1[N1 - 1 - i]; KeyValuePair& va = v1[N1 - 1 - i]; - K& kb = k2[i]; + K& kb = k2[i]; KeyValuePair& vb = v2[i]; K otherKa; @@ -457,13 +475,13 @@ inline __device__ void warpMergeAnyRegistersKVP(K k1[N1], if (FullMerge) { // We need the other values - otherKa = shfl_xor(ka, kWarpSize - 1); + otherKa = shfl_xor(ka, kWarpSize - 1); K otherVak = shfl_xor(va.key, kWarpSize - 1); V otherVav = shfl_xor(va.value, kWarpSize - 1); - otherVa = KeyValuePair(otherVak, otherVav); + otherVa = KeyValuePair(otherVak, otherVav); } - K otherKb = shfl_xor(kb, kWarpSize - 1); + K otherKb = shfl_xor(kb, kWarpSize - 1); K otherVbk = shfl_xor(vb.key, kWarpSize - 1); V otherVbv = shfl_xor(vb.value, kWarpSize - 1); @@ -487,12 +505,10 @@ inline __device__ void warpMergeAnyRegistersKVP(K k1[N1], } } - BitonicMergeStepKVP::merge( - k1, v1); + BitonicMergeStepKVP::merge(k1, v1); if (FullMerge) { // Only if we care about N2 do we need to bother merging it fully - BitonicMergeStepKVP::merge(k2, v2); + BitonicMergeStepKVP::merge(k2, v2); } } @@ -500,7 +516,8 @@ inline __device__ void warpMergeAnyRegistersKVP(K k1[N1], // bitonic sort template struct BitonicSortStepKVP { - static inline __device__ void sort(K k[N], KeyValuePair v[N]) { + static inline __device__ void sort(K k[N], KeyValuePair v[N]) + { static_assert(N > 1, "did not hit specialized case"); // Sort recursively @@ -512,8 +529,8 @@ struct BitonicSortStepKVP { #pragma unroll for (int i = 0; i < kSizeA; ++i) { - aK[i] = k[i]; - aV[i].key = v[i].key; + aK[i] = k[i]; + aV[i].key = v[i].key; aV[i].value = v[i].value; } @@ -524,8 +541,8 @@ struct BitonicSortStepKVP { #pragma unroll for (int i = 0; i < kSizeB; ++i) { - bK[i] = k[i + kSizeA]; - bV[i].key = v[i + kSizeA].key; + bK[i] = k[i + kSizeA]; + bV[i].key = v[i + kSizeA].key; bV[i].value = v[i + kSizeA].value; } @@ -536,15 +553,15 @@ struct BitonicSortStepKVP { #pragma unroll for (int i = 0; i < kSizeA; ++i) { - k[i] = aK[i]; - v[i].key = aV[i].key; + k[i] = aK[i]; + v[i].key = aV[i].key; v[i].value = aV[i].value; } #pragma unroll for (int i = 0; i < kSizeB; ++i) { - k[i + kSizeA] = bK[i]; - v[i + kSizeA].key = bV[i].key; + k[i + kSizeA] = bK[i]; + v[i + kSizeA].key = bV[i].key; v[i + kSizeA].value = bV[i].value; } } @@ -553,7 +570,8 @@ struct BitonicSortStepKVP { // Single warp (N == 1) sorting specialization template struct BitonicSortStepKVP { - static inline __device__ void sort(K k[1], KeyValuePair v[1]) { + static inline __device__ void sort(K k[1], KeyValuePair v[1]) + { // Update this code if this changes // should go from 1 -> kWarpSize in multiples of 2 static_assert(kWarpSize == 32, "unexpected warp size"); @@ -569,61 +587,64 @@ struct BitonicSortStepKVP { /// Sort a list of kWarpSize * N elements in registers, where N is an /// arbitrary >= 1 template -inline __device__ void warpSortAnyRegistersKVP(K k[N], - KeyValuePair v[N]) { +inline __device__ void warpSortAnyRegistersKVP(K k[N], KeyValuePair v[N]) +{ BitonicSortStepKVP::sort(k, v); } // `Dir` true, produce largest values. // `Dir` false, produce smallest values. -template +template struct KeyValueWarpSelect { static constexpr int kNumWarpQRegisters = NumWarpQ / faiss::gpu::kWarpSize; - __device__ inline KeyValueWarpSelect(K initKVal, - faiss::gpu::KeyValuePair initVVal, - int k) + __device__ inline KeyValueWarpSelect(K initKVal, faiss::gpu::KeyValuePair initVVal, int k) : initK(initKVal), initV(initVVal), numVals(0), warpKTop(initKVal), warpKTopRDist(initKVal), - kLane((k - 1) % faiss::gpu::kWarpSize) { - static_assert(faiss::gpu::utils::isPowerOf2(ThreadsPerBlock), - "threads must be a power-of-2"); - static_assert(faiss::gpu::utils::isPowerOf2(NumWarpQ), - "warp queue must be power-of-2"); + kLane((k - 1) % faiss::gpu::kWarpSize) + { + static_assert(faiss::gpu::utils::isPowerOf2(ThreadsPerBlock), "threads must be a power-of-2"); + static_assert(faiss::gpu::utils::isPowerOf2(NumWarpQ), "warp queue must be power-of-2"); // Fill the per-thread queue keys with the default value #pragma unroll for (int i = 0; i < NumThreadQ; ++i) { - threadK[i] = initK; - threadV[i].key = initV.key; + threadK[i] = initK; + threadV[i].key = initV.key; threadV[i].value = initV.value; } // Fill the warp queue with the default value #pragma unroll for (int i = 0; i < kNumWarpQRegisters; ++i) { - warpK[i] = initK; - warpV[i].key = initV.key; + warpK[i] = initK; + warpV[i].key = initV.key; warpV[i].value = initV.value; } } - __device__ inline void addThreadQ(K k, faiss::gpu::KeyValuePair& v) { + __device__ inline void addThreadQ(K k, faiss::gpu::KeyValuePair& v) + { if (Dir ? Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) { // Rotate right #pragma unroll for (int i = NumThreadQ - 1; i > 0; --i) { - threadK[i] = threadK[i - 1]; - threadV[i].key = threadV[i - 1].key; + threadK[i] = threadK[i - 1]; + threadV[i].key = threadV[i - 1].key; threadV[i].value = threadV[i - 1].value; } - threadK[0] = k; - threadV[0].key = v.key; + threadK[0] = k; + threadV[0].key = v.key; threadV[0].value = v.value; ++numVals; } @@ -633,33 +654,35 @@ struct KeyValueWarpSelect { /// list across both // TODO - __device__ inline void mergeWarpQ() { + __device__ inline void mergeWarpQ() + { // Sort all of the per-thread queues - faiss::gpu::warpSortAnyRegistersKVP(threadK, - threadV); + faiss::gpu::warpSortAnyRegistersKVP(threadK, threadV); // The warp queue is already sorted, and now that we've sorted the // per-thread queue, merge both sorted lists together, producing // one sorted list - faiss::gpu::warpMergeAnyRegistersKVP(warpK, warpV, - threadK, threadV); + faiss::gpu::warpMergeAnyRegistersKVP( + warpK, warpV, threadK, threadV); } /// WARNING: all threads in a warp must participate in this. /// Otherwise, you must call the constituent parts separately. - __device__ inline void add(K k, faiss::gpu::KeyValuePair& v) { + __device__ inline void add(K k, faiss::gpu::KeyValuePair& v) + { addThreadQ(k, v); checkThreadQ(); } - __device__ inline void reduce() { + __device__ inline void reduce() + { // Have all warps dump and merge their queues; this will produce // the final per-warp results mergeWarpQ(); } - __device__ inline void checkThreadQ() { + __device__ inline void checkThreadQ() + { bool needSort = (numVals == NumThreadQ); #if CUDA_VERSION >= 9000 @@ -681,18 +704,19 @@ struct KeyValueWarpSelect { #pragma unroll for (int i = 0; i < NumThreadQ; ++i) { - threadK[i] = initK; - threadV[i].key = initV.key; + threadK[i] = initK; + threadV[i].key = initV.key; threadV[i].value = initV.value; } // We have to beat at least this element warpKTopRDist = shfl(warpV[kNumWarpQRegisters - 1].key, kLane); - warpKTop = shfl(warpK[kNumWarpQRegisters - 1], kLane); + warpKTop = shfl(warpK[kNumWarpQRegisters - 1], kLane); } /// Dump final k selected values for this warp out - __device__ inline void writeOut(K* outK, V* outV, int k) { + __device__ inline void writeOut(K* outK, V* outV, int k) + { int laneId = faiss::gpu::getLaneId(); #pragma unroll diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index a2e9151dbc..eb9a8f1436 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -52,12 +52,17 @@ using deviceAllocator = raft::mr::device::allocator; * @param translations */ template -inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, - value_idx *outV, size_t n_samples, int n_parts, - int k, cudaStream_t stream, - value_idx *translations) { - detail::knn_merge_parts(inK, inV, outK, outV, n_samples, n_parts, k, stream, - translations); +inline void knn_merge_parts(value_t* inK, + value_idx* inV, + value_t* outK, + value_idx* outV, + size_t n_samples, + int n_parts, + int k, + cudaStream_t stream, + value_idx* translations) +{ + detail::knn_merge_parts(inK, inV, outK, outV, n_samples, n_parts, k, stream, translations); } /** @@ -82,9 +87,16 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, * @param stream */ template -inline void select_k(value_t *inK, value_idx *inV, size_t n_rows, size_t n_cols, - value_t *outK, value_idx *outV, bool select_min, int k, - cudaStream_t stream) { +inline void select_k(value_t* inK, + value_idx* inV, + size_t n_rows, + size_t n_cols, + value_t* outK, + value_idx* outV, + bool select_min, + int k, + cudaStream_t stream) +{ detail::select_k(inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); } @@ -111,22 +123,41 @@ inline void select_k(value_t *inK, value_idx *inV, size_t n_rows, size_t n_cols, * @param[in] translations starting offsets for partitions. should be the same size * as input vector. */ -inline void brute_force_knn( - raft::handle_t const &handle, std::vector &input, - std::vector &sizes, int D, float *search_items, int n, int64_t *res_I, - float *res_D, int k, bool rowMajorIndex = true, bool rowMajorQuery = true, - std::vector *translations = nullptr, - distance::DistanceType metric = distance::DistanceType::L2Expanded, - float metric_arg = 2.0f) { - ASSERT(input.size() == sizes.size(), - "input and sizes vectors must be the same size"); +inline void brute_force_knn(raft::handle_t const& handle, + std::vector& input, + std::vector& sizes, + int D, + float* search_items, + int n, + int64_t* res_I, + float* res_D, + int k, + bool rowMajorIndex = true, + bool rowMajorQuery = true, + std::vector* translations = nullptr, + distance::DistanceType metric = distance::DistanceType::L2Expanded, + float metric_arg = 2.0f) +{ + ASSERT(input.size() == sizes.size(), "input and sizes vectors must be the same size"); std::vector int_streams = handle.get_internal_streams(); - detail::brute_force_knn_impl(input, sizes, D, search_items, n, res_I, res_D, - k, handle.get_stream(), int_streams.data(), - handle.get_num_internal_streams(), rowMajorIndex, - rowMajorQuery, translations, metric, metric_arg); + detail::brute_force_knn_impl(input, + sizes, + D, + search_items, + n, + res_I, + res_D, + k, + handle.get_stream(), + int_streams.data(), + handle.get_num_internal_streams(), + rowMajorIndex, + rowMajorQuery, + translations, + metric, + metric_arg); } } // namespace knn } // namespace spatial diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index 6f507331d9..221a9679d4 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -24,8 +24,7 @@ using namespace matrix; // aggregate of control params for Eigen Solver: // -template +template struct cluster_solver_config_t { size_type_t n_clusters; size_type_t maxIter; @@ -35,23 +34,35 @@ struct cluster_solver_config_t { unsigned long long seed{123456}; }; -template +template struct kmeans_solver_t { - explicit kmeans_solver_t(cluster_solver_config_t const& config) - : config_(config) {} - - std::pair solve( - handle_t const& handle, size_type_t n_obs_vecs, size_type_t dim, - value_type_t const* __restrict__ obs, - index_type_t* __restrict__ codes) const { + explicit kmeans_solver_t( + cluster_solver_config_t const& config) + : config_(config) + { + } + + std::pair solve(handle_t const& handle, + size_type_t n_obs_vecs, + size_type_t dim, + value_type_t const* __restrict__ obs, + index_type_t* __restrict__ codes) const + { RAFT_EXPECTS(obs != nullptr, "Null obs buffer."); RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); value_type_t residual{}; index_type_t iters{}; - kmeans(handle, n_obs_vecs, dim, config_.n_clusters, config_.tol, - config_.maxIter, obs, codes, residual, iters, config_.seed); + kmeans(handle, + n_obs_vecs, + dim, + config_.n_clusters, + config_.tol, + config_.maxIter, + obs, + codes, + residual, + iters, + config_.seed); return std::make_pair(residual, iters); } diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp index e36dca2e0c..156b996586 100644 --- a/cpp/include/raft/spectral/eigen_solvers.hpp +++ b/cpp/include/raft/spectral/eigen_solvers.hpp @@ -23,8 +23,7 @@ using namespace matrix; // aggregate of control params for Eigen Solver: // -template +template struct eigen_solver_config_t { size_type_t n_eigVecs; size_type_t maxIter; @@ -34,42 +33,59 @@ struct eigen_solver_config_t { bool reorthogonalize{false}; unsigned long long seed{ - 1234567}; // CAVEAT: this default value is now common to all instances of using seed in Lanczos; was not the case before: there were places where a default seed = 123456 was used; this may trigger slightly different # solver iterations + 1234567}; // CAVEAT: this default value is now common to all instances of using seed in + // Lanczos; was not the case before: there were places where a default seed = 123456 + // was used; this may trigger slightly different # solver iterations }; -template +template struct lanczos_solver_t { - explicit lanczos_solver_t(eigen_solver_config_t const& config) - : config_(config) {} + explicit lanczos_solver_t( + eigen_solver_config_t const& config) + : config_(config) + { + } - index_type_t solve_smallest_eigenvectors( - handle_t const& handle, - sparse_matrix_t const& A, - value_type_t* __restrict__ eigVals, - value_type_t* __restrict__ eigVecs) const { + index_type_t solve_smallest_eigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const + { RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; - computeSmallestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, - config_.restartIter, config_.tol, - config_.reorthogonalize, iters, eigVals, - eigVecs, config_.seed); + computeSmallestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, + config_.seed); return iters; } - index_type_t solve_largest_eigenvectors( - handle_t const& handle, - sparse_matrix_t const& A, - value_type_t* __restrict__ eigVals, - value_type_t* __restrict__ eigVecs) const { + index_type_t solve_largest_eigenvectors(handle_t const& handle, + sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const + { RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); index_type_t iters{}; - computeLargestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter, - config_.restartIter, config_.tol, - config_.reorthogonalize, iters, eigVals, eigVecs, + computeLargestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, config_.seed); return iters; } diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index d089b85518..18b23bea55 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -43,15 +43,15 @@ using namespace raft::linalg; // Useful grid settings // ========================================================= -constexpr unsigned int BLOCK_SIZE = 1024; -constexpr unsigned int WARP_SIZE = 32; +constexpr unsigned int BLOCK_SIZE = 1024; +constexpr unsigned int WARP_SIZE = 32; constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); // ========================================================= // CUDA kernels // ========================================================= -/** +/** * @brief Compute distances between observation vectors and centroids * Block dimensions should be (warpSize, 1, * blockSize/warpSize). Ideally, the grid is large enough so there @@ -75,11 +75,13 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE); * initialized to zero. */ template -static __global__ void computeDistances( - index_type_t n, index_type_t d, index_type_t k, - const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, - value_type_t* __restrict__ dists) { +static __global__ void computeDistances(index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists) +{ // Loop index index_type_t i; @@ -114,12 +116,10 @@ static __global__ void computeDistances( // Perform reduction on warp for (i = WARP_SIZE / 2; i > 0; i /= 2) - dist_private += - __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i); + dist_private += __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i); // Write result to global memory - if (threadIdx.x == 0) - atomicAdd(dists + IDX(gidz, gidy, n), dist_private); + if (threadIdx.x == 0) atomicAdd(dists + IDX(gidz, gidy, n), dist_private); // Move to another observation vector gidz += blockDim.z * gridDim.z; @@ -134,8 +134,8 @@ static __global__ void computeDistances( } } -/** - * @brief Find closest centroid to observation vectors. +/** + * @brief Find closest centroid to observation vectors. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. * @tparam index_type_t the type of data used for indexing. @@ -156,10 +156,12 @@ static __global__ void computeDistances( * cluster. Entries must be initialized to zero. */ template -static __global__ void minDistances(index_type_t n, index_type_t k, +static __global__ void minDistances(index_type_t n, + index_type_t k, value_type_t* __restrict__ dists, index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) { + index_type_t* __restrict__ clusterSizes) +{ // Loop index index_type_t i, j; @@ -178,8 +180,8 @@ static __global__ void minDistances(index_type_t n, index_type_t k, dist_min = dists[IDX(i, 0, n)]; for (j = 1; j < k; ++j) { dist_curr = dists[IDX(i, j, n)]; - code_min = (dist_curr < dist_min) ? j : code_min; - dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; + code_min = (dist_curr < dist_min) ? j : code_min; + dist_min = (dist_curr < dist_min) ? dist_curr : dist_min; } // Transfer result to global memory @@ -194,8 +196,8 @@ static __global__ void minDistances(index_type_t n, index_type_t k, } } -/** - * @brief Check if newly computed distances are smaller than old distances. +/** + * @brief Check if newly computed distances are smaller than old distances. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. * @tparam index_type_t the type of data used for indexing. @@ -218,7 +220,8 @@ static __global__ void minDistances2(index_type_t n, value_type_t* __restrict__ dists_old, const value_type_t* __restrict__ dists_new, index_type_t* __restrict__ codes_old, - index_type_t code_new) { + index_type_t code_new) +{ // Loop index index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; @@ -243,7 +246,7 @@ static __global__ void minDistances2(index_type_t n, } } -/** +/** * @brief Compute size of k-means clusters. * Block and grid dimensions should be 1-dimensional. Ideally the * grid is large enough so there are n threads. @@ -255,9 +258,10 @@ static __global__ void minDistances2(index_type_t n, * cluster. Entries must be initialized to zero. */ template -static __global__ void computeClusterSizes( - index_type_t n, const index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes) { +static __global__ void computeClusterSizes(index_type_t n, + const index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes) +{ index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { atomicAdd(clusterSizes + codes[i], 1); @@ -265,8 +269,8 @@ static __global__ void computeClusterSizes( } } -/** - * @brief Divide rows of centroid matrix by cluster sizes. +/** + * @brief Divide rows of centroid matrix by cluster sizes. * Divides the ith column of the sum matrix by the size of the ith * cluster. If the sum matrix has been initialized so that the ith * row is the sum of all observation vectors in the ith cluster, @@ -287,9 +291,11 @@ static __global__ void computeClusterSizes( * column is the mean position of a cluster). */ template -static __global__ void divideCentroids( - index_type_t d, index_type_t k, const index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ centroids) { +static __global__ void divideCentroids(index_type_t d, + index_type_t k, + const index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ centroids) +{ // Global indices index_type_t gidx, gidy; @@ -340,11 +346,14 @@ static __global__ void divideCentroids( * @return Zero if successful. Otherwise non-zero. */ template -static int chooseNewCentroid(handle_t const& handle, index_type_t n, - index_type_t d, value_type_t rand, +static int chooseNewCentroid(handle_t const& handle, + index_type_t n, + index_type_t d, + value_type_t rand, const value_type_t* __restrict__ obs, value_type_t* __restrict__ dists, - value_type_t* __restrict__ centroid) { + value_type_t* __restrict__ centroid) +{ // Cumulative sum of distances value_type_t* distsCumSum = dists + n; // Residual sum of squares @@ -352,44 +361,44 @@ static int chooseNewCentroid(handle_t const& handle, index_type_t n, // Observation vector that is chosen as new centroid index_type_t obsIndex; - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); auto thrust_exec_policy = handle.get_thrust_policy(); // Compute cumulative sum of distances - thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists), + thrust::inclusive_scan(thrust_exec_policy, + thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), thrust::device_pointer_cast(distsCumSum)); CHECK_CUDA(stream); - CUDA_TRY(cudaMemcpyAsync(&distsSum, distsCumSum + n - 1, sizeof(value_type_t), - cudaMemcpyDeviceToHost, stream)); + CUDA_TRY(cudaMemcpyAsync( + &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); // Randomly choose observation vector // Probabilities are proportional to square of distance to closest // centroid (see k-means++ algorithm) // - //seg-faults due to Thrust bug - //on binary-search-like algorithms - //when run with stream dependent - //execution policies; fixed on Thrust GitHub - //hence replace w/ linear interpolation, - //until the Thrust issue gets resolved: + // seg-faults due to Thrust bug + // on binary-search-like algorithms + // when run with stream dependent + // execution policies; fixed on Thrust GitHub + // hence replace w/ linear interpolation, + // until the Thrust issue gets resolved: // // obsIndex = (thrust::lower_bound( // thrust_exec_policy, thrust::device_pointer_cast(distsCumSum), // thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) - // thrust::device_pointer_cast(distsCumSum)); // - //linear interpolation logic: + // linear interpolation logic: //{ value_type_t minSum{0}; - CUDA_TRY(cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), - cudaMemcpyDeviceToHost, stream)); + CUDA_TRY( + cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream)); CHECK_CUDA(stream); if (distsSum > minSum) { value_type_t vIndex = static_cast(n - 1); - obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / - (distsSum - minSum)); + obsIndex = static_cast(vIndex * (distsSum * rand - minSum) / (distsSum - minSum)); } else { obsIndex = 0; } @@ -400,15 +409,17 @@ static int chooseNewCentroid(handle_t const& handle, index_type_t n, obsIndex = min(obsIndex, n - 1); // Record new centroid position - CUDA_TRY(cudaMemcpyAsync(centroid, obs + IDX(0, obsIndex, d), - d * sizeof(value_type_t), cudaMemcpyDeviceToDevice, + CUDA_TRY(cudaMemcpyAsync(centroid, + obs + IDX(0, obsIndex, d), + d * sizeof(value_type_t), + cudaMemcpyDeviceToDevice, stream)); return 0; } /** - * @brief Choose initial cluster centroids for k-means algorithm. + * @brief Choose initial cluster centroids for k-means algorithm. * Centroids are randomly chosen with k-means++ algorithm * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. @@ -432,11 +443,17 @@ static int chooseNewCentroid(handle_t const& handle, index_type_t n, * @return Zero if successful. Otherwise non-zero. */ template -static int initializeCentroids( - handle_t const& handle, index_type_t n, index_type_t d, index_type_t k, - const value_type_t* __restrict__ obs, value_type_t* __restrict__ centroids, - index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, - value_type_t* __restrict__ dists, unsigned long long seed) { +static int initializeCentroids(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, + const value_type_t* __restrict__ obs, + value_type_t* __restrict__ centroids, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ dists, + unsigned long long seed) +{ // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -448,7 +465,7 @@ static int initializeCentroids( thrust::default_random_engine rng(seed); thrust::uniform_real_distribution uniformDist(0, 1); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); auto thrust_exec_policy = handle.get_thrust_policy(); constexpr index_type_t grid_lower_bound{65535}; @@ -461,35 +478,34 @@ static int initializeCentroids( dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE}; // CUDA grid dimensions - dim3 gridDim_warp{ - min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1, - min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; + dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + 1, + min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)}; // CUDA grid dimensions - dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), - 1, 1}; + dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1}; // Assign observation vectors to code 0 CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); // Choose first centroid - thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), - thrust::device_pointer_cast(dists + n), 1); + thrust::fill(thrust_exec_policy, + thrust::device_pointer_cast(dists), + thrust::device_pointer_cast(dists + n), + 1); CHECK_CUDA(stream); if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids)) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream)); - computeDistances<<>>( - n, d, 1, obs, centroids, dists); + computeDistances<<>>(n, d, 1, obs, centroids, dists); CHECK_CUDA(stream); // Choose remaining centroids for (i = 1; i < k; ++i) { // Choose ith centroid - if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, - centroids + IDX(0, i, d))) + if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid @@ -499,22 +515,20 @@ static int initializeCentroids( CHECK_CUDA(stream); // Recompute minimum distances - minDistances2<<>>(n, dists, dists + n, - codes, i); + minDistances2<<>>(n, dists, dists + n, codes, i); CHECK_CUDA(stream); } // Compute cluster sizes CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); - computeClusterSizes<<>>(n, codes, - clusterSizes); + computeClusterSizes<<>>(n, codes, clusterSizes); CHECK_CUDA(stream); return 0; } -/** - * @brief Find cluster centroids closest to observation vectors. +/** + * @brief Find cluster centroids closest to observation vectors. * Distance is measured with Euclidean norm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. @@ -540,15 +554,18 @@ static int initializeCentroids( * @return Zero if successful. Otherwise non-zero. */ template -static int assignCentroids(handle_t const& handle, index_type_t n, - index_type_t d, index_type_t k, +static int assignCentroids(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, const value_type_t* __restrict__ obs, const value_type_t* __restrict__ centroids, value_type_t* __restrict__ dists, index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, - value_type_t* residual_host) { - auto stream = handle.get_stream(); + value_type_t* residual_host) +{ + auto stream = handle.get_stream(); auto thrust_exec_policy = handle.get_thrust_policy(); // Compute distance between centroids and observation vectors @@ -561,11 +578,9 @@ static int assignCentroids(handle_t const& handle, index_type_t n, constexpr index_type_t grid_lower_bound{65535}; gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound); gridDim.y = min(k, grid_lower_bound); - gridDim.z = - min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); + gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound); - computeDistances<<>>(n, d, k, obs, centroids, - dists); + computeDistances<<>>(n, d, k, obs, centroids, dists); CHECK_CUDA(stream); // Find centroid closest to each observation vector @@ -573,23 +588,21 @@ static int assignCentroids(handle_t const& handle, index_type_t n, blockDim.x = BLOCK_SIZE; blockDim.y = 1; blockDim.z = 1; - gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); - gridDim.y = 1; - gridDim.z = 1; - minDistances<<>>(n, k, dists, codes, - clusterSizes); + gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound); + gridDim.y = 1; + gridDim.z = 1; + minDistances<<>>(n, k, dists, codes, clusterSizes); CHECK_CUDA(stream); // Compute residual sum of squares - *residual_host = - thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(dists), - thrust::device_pointer_cast(dists + n)); + *residual_host = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n)); return 0; } -/** - * @brief Update cluster centroids for k-means algorithm. +/** + * @brief Update cluster centroids for k-means algorithm. * All clusters are assumed to be non-empty. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. @@ -613,26 +626,29 @@ static int assignCentroids(handle_t const& handle, index_type_t n, * @return Zero if successful. Otherwise non-zero. */ template -static int updateCentroids(handle_t const& handle, index_type_t n, - index_type_t d, index_type_t k, +static int updateCentroids(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, const value_type_t* __restrict__ obs, const index_type_t* __restrict__ codes, const index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ centroids, value_type_t* __restrict__ work, - index_type_t* __restrict__ work_int) { + index_type_t* __restrict__ work_int) +{ // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- // Useful constants - const value_type_t one = 1; + const value_type_t one = 1; const value_type_t zero = 0; constexpr index_type_t grid_lower_bound{65535}; - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); auto thrust_exec_policy = handle.get_thrust_policy(); // Device memory @@ -641,34 +657,56 @@ static int updateCentroids(handle_t const& handle, index_type_t n, thrust::device_ptr rows(work_int + d * n); // Take transpose of observation matrix - CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, n, d, &one, obs, - d, &zero, (value_type_t*)NULL, n, - thrust::raw_pointer_cast(obs_copy), n, stream)); + CUBLAS_CHECK(cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + n, + d, + &one, + obs, + d, + &zero, + (value_type_t*)NULL, + n, + thrust::raw_pointer_cast(obs_copy), + n, + stream)); // Cluster assigned to each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); CHECK_CUDA(stream); - thrust::transform(thrust_exec_policy, rows, rows + d * n, - thrust::make_constant_iterator(n), rows, + thrust::transform(thrust_exec_policy, + rows, + rows + d * n, + thrust::make_constant_iterator(n), + rows, thrust::modulus()); CHECK_CUDA(stream); - thrust::gather(thrust_exec_policy, rows, rows + d * n, - thrust::device_pointer_cast(codes), codes_copy); + thrust::gather( + thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy); CHECK_CUDA(stream); // Row associated with each observation matrix entry thrust::sequence(thrust_exec_policy, rows, rows + d * n); CHECK_CUDA(stream); - thrust::transform(thrust_exec_policy, rows, rows + d * n, - thrust::make_constant_iterator(n), rows, + thrust::transform(thrust_exec_policy, + rows, + rows + d * n, + thrust::make_constant_iterator(n), + rows, thrust::divides()); CHECK_CUDA(stream); // Sort and reduce to add observation vectors in same cluster - thrust::stable_sort_by_key(thrust_exec_policy, codes_copy, codes_copy + d * n, + thrust::stable_sort_by_key(thrust_exec_policy, + codes_copy, + codes_copy + d * n, make_zip_iterator(make_tuple(obs_copy, rows))); CHECK_CUDA(stream); - thrust::reduce_by_key(thrust_exec_policy, rows, rows + d * n, obs_copy, + thrust::reduce_by_key(thrust_exec_policy, + rows, + rows + d * n, + obs_copy, codes_copy, // Output to codes_copy is ignored thrust::device_pointer_cast(centroids)); CHECK_CUDA(stream); @@ -679,12 +717,11 @@ static int updateCentroids(handle_t const& handle, index_type_t n, dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1}; // CUDA grid dimensions - dim3 gridDim{ - min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), - min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), 1}; + dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), + 1}; - divideCentroids<<>>(d, k, clusterSizes, - centroids); + divideCentroids<<>>(d, k, clusterSizes, centroids); CHECK_CUDA(stream); return 0; @@ -698,8 +735,8 @@ namespace raft { // k-means algorithm // ========================================================= -/** - * @brief Find clusters with k-means algorithm. +/** + * @brief Find clusters with k-means algorithm. * Initial centroids are chosen with k-means++ algorithm. Empty * clusters are reinitialized by choosing new centroids with * k-means++ algorithm. @@ -735,15 +772,22 @@ namespace raft { * @return error flag. */ template -int kmeans(handle_t const& handle, index_type_t n, index_type_t d, - index_type_t k, value_type_t tol, index_type_t maxiter, +int kmeans(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, + value_type_t tol, + index_type_t maxiter, const value_type_t* __restrict__ obs, index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ centroids, - value_type_t* __restrict__ work, index_type_t* __restrict__ work_int, - value_type_t* residual_host, index_type_t* iters_host, - unsigned long long seed) { + value_type_t* __restrict__ work, + index_type_t* __restrict__ work_int, + value_type_t* residual_host, + index_type_t* iters_host, + unsigned long long seed) +{ // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -764,101 +808,93 @@ int kmeans(handle_t const& handle, index_type_t n, index_type_t d, // Initialization // ------------------------------------------------------- - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); + auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); auto thrust_exec_policy = handle.get_thrust_policy(); // Trivial cases if (k == 1) { CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); - CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), - cudaMemcpyHostToDevice, stream)); - if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, - work, work_int)) + CUDA_TRY( + cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) WARNING("could not compute k-means centroids"); dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; dim3 gridDim{ - min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1, - min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), - grid_lower_bound)}; + min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), + 1, + min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), grid_lower_bound)}; CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream)); - computeDistances<<>>(n, d, 1, obs, centroids, - work); + computeDistances<<>>(n, d, 1, obs, centroids, work); CHECK_CUDA(stream); - *residual_host = - thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(work), - thrust::device_pointer_cast(work + n)); + *residual_host = thrust::reduce( + thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n)); CHECK_CUDA(stream); return 0; } if (n <= k) { - thrust::sequence(thrust_exec_policy, thrust::device_pointer_cast(codes), + thrust::sequence(thrust_exec_policy, + thrust::device_pointer_cast(codes), thrust::device_pointer_cast(codes + n)); CHECK_CUDA(stream); - thrust::fill_n(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), n, 1); + thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1); CHECK_CUDA(stream); if (n < k) - CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, - (k - n) * sizeof(index_type_t), stream)); - CUDA_TRY(cudaMemcpyAsync(centroids, obs, d * n * sizeof(value_type_t), - cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(index_type_t), stream)); + CUDA_TRY(cudaMemcpyAsync( + centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream)); *residual_host = 0; return 0; } // Initialize cuBLAS - CUBLAS_CHECK( - linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // ------------------------------------------------------- // k-means++ algorithm // ------------------------------------------------------- // Choose initial cluster centroids - if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes, - work, seed)) + if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes, work, seed)) WARNING("could not initialize k-means centroids"); // Apply k-means iteration until convergence for (iter = 0; iter < maxiter; ++iter) { // Update cluster centroids - if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, - work, work_int)) + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int)) WARNING("could not update k-means centroids"); // Determine centroid closest to each observation residualPrev = *residual_host; - if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, - clusterSizes, residual_host)) + if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) WARNING("could not assign observation vectors to k-means clusters"); // Reinitialize empty clusters with new centroids - index_type_t emptyCentroid = - (thrust::find(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), 0) - - thrust::device_pointer_cast(clusterSizes)); + index_type_t emptyCentroid = (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); // FIXME: emptyCentroid never reaches k (infinite loop) under certain // conditions, such as if obs is corrupt (as seen as a result of a // DataFrame column of NULL edge vals used to create the Graph) while (emptyCentroid < k) { - if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, work, - centroids + IDX(0, emptyCentroid, d))) + if (chooseNewCentroid( + handle, n, d, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) WARNING("could not replace empty centroid"); - if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, - clusterSizes, residual_host)) + if (assignCentroids( + handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host)) WARNING("could not assign observation vectors to k-means clusters"); - emptyCentroid = - (thrust::find(thrust_exec_policy, - thrust::device_pointer_cast(clusterSizes), - thrust::device_pointer_cast(clusterSizes + k), 0) - - thrust::device_pointer_cast(clusterSizes)); + emptyCentroid = (thrust::find(thrust_exec_policy, + thrust::device_pointer_cast(clusterSizes), + thrust::device_pointer_cast(clusterSizes + k), + 0) - + thrust::device_pointer_cast(clusterSizes)); CHECK_CUDA(stream); } @@ -870,14 +906,13 @@ int kmeans(handle_t const& handle, index_type_t n, index_type_t d, } // Warning if k-means has failed to converge - if (std::fabs(residualPrev - (*residual_host)) / n >= tol) - WARNING("k-means failed to converge"); + if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge"); *iters_host = iter; return 0; } -/** +/** * @brief Find clusters with k-means algorithm. * Initial centroids are chosen with k-means++ algorithm. Empty * clusters are reinitialized by choosing new centroids with @@ -903,11 +938,18 @@ int kmeans(handle_t const& handle, index_type_t n, index_type_t d, * @return error flag */ template -int kmeans(handle_t const& handle, index_type_t n, index_type_t d, - index_type_t k, value_type_t tol, index_type_t maxiter, +int kmeans(handle_t const& handle, + index_type_t n, + index_type_t d, + index_type_t k, + value_type_t tol, + index_type_t maxiter, const value_type_t* __restrict__ obs, - index_type_t* __restrict__ codes, value_type_t& residual, - index_type_t& iters, unsigned long long seed = 123456) { + index_type_t* __restrict__ codes, + value_type_t& residual, + index_type_t& iters, + unsigned long long seed = 123456) +{ using namespace matrix; // Check that parameters are valid @@ -924,9 +966,21 @@ int kmeans(handle_t const& handle, index_type_t n, index_type_t d, vector_t work_int(handle, 2 * d * n); // Perform k-means - return kmeans( - handle, n, d, k, tol, maxiter, obs, codes, clusterSizes.raw(), - centroids.raw(), work.raw(), work_int.raw(), &residual, &iters, seed); + return kmeans(handle, + n, + d, + k, + tol, + maxiter, + obs, + codes, + clusterSizes.raw(), + centroids.raw(), + work.raw(), + work_int.raw(), + &residual, + &iters, + seed); } } // namespace raft diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp index d14bf05f37..35fc22c770 100644 --- a/cpp/include/raft/spectral/lapack.hpp +++ b/cpp/include/raft/spectral/lapack.hpp @@ -21,66 +21,125 @@ #include #include -//for now; TODO: check if/where this `define` should be; +// for now; TODO: check if/where this `define` should be; // #define USE_LAPACK namespace raft { -#define lapackCheckError(status) \ - { \ - if (status < 0) { \ - std::stringstream ss; \ - ss << "Lapack error: argument number " << -status \ - << " had an illegal value."; \ - throw exception(ss.str()); \ - } else if (status > 0) \ - RAFT_FAIL("Lapack error: internal error."); \ +#define lapackCheckError(status) \ + { \ + if (status < 0) { \ + std::stringstream ss; \ + ss << "Lapack error: argument number " << -status << " had an illegal value."; \ + throw exception(ss.str()); \ + } else if (status > 0) \ + RAFT_FAIL("Lapack error: internal error."); \ } -extern "C" void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, - float *work, int *lwork, int *info); -extern "C" void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau, - double *work, int *lwork, int *info); -extern "C" void sormqr_(char *side, char *trans, int *m, int *n, int *k, - float *a, int *lda, const float *tau, float *c, - int *ldc, float *work, int *lwork, int *info); -extern "C" void dormqr_(char *side, char *trans, int *m, int *n, int *k, - double *a, int *lda, const double *tau, double *c, - int *ldc, double *work, int *lwork, int *info); -extern "C" int dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, - double *wr, double *wi, double *vl, int *ldvl, double *vr, - int *ldvr, double *work, int *lwork, int *info); - -extern "C" int sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, - float *wr, float *wi, float *vl, int *ldvl, float *vr, - int *ldvr, float *work, int *lwork, int *info); - -extern "C" cusolverStatus_t cusolverDnSgemmHost( - cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - const float *alpha, const float *A, int lda, const float *B, int ldb, - const float *beta, float *C, int ldc); - -extern "C" cusolverStatus_t cusolverDnDgemmHost( - cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - const double *alpha, const double *A, int lda, const double *B, int ldb, - const double *beta, double *C, int ldc); - -extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float *d, float *e, - int *info); - -extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double *d, double *e, - int *info); - -extern "C" cusolverStatus_t cusolverDnSsteqrHost(const signed char *compz, - int n, float *d, float *e, - float *z, int ldz, float *work, - int *info); - -extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char *compz, - int n, double *d, double *e, - double *z, int ldz, - double *work, int *info); +extern "C" void sgeqrf_( + int* m, int* n, float* a, int* lda, float* tau, float* work, int* lwork, int* info); +extern "C" void dgeqrf_( + int* m, int* n, double* a, int* lda, double* tau, double* work, int* lwork, int* info); +extern "C" void sormqr_(char* side, + char* trans, + int* m, + int* n, + int* k, + float* a, + int* lda, + const float* tau, + float* c, + int* ldc, + float* work, + int* lwork, + int* info); +extern "C" void dormqr_(char* side, + char* trans, + int* m, + int* n, + int* k, + double* a, + int* lda, + const double* tau, + double* c, + int* ldc, + double* work, + int* lwork, + int* info); +extern "C" int dgeev_(char* jobvl, + char* jobvr, + int* n, + double* a, + int* lda, + double* wr, + double* wi, + double* vl, + int* ldvl, + double* vr, + int* ldvr, + double* work, + int* lwork, + int* info); + +extern "C" int sgeev_(char* jobvl, + char* jobvr, + int* n, + float* a, + int* lda, + float* wr, + float* wi, + float* vl, + int* ldvl, + float* vr, + int* ldvr, + float* work, + int* lwork, + int* info); + +extern "C" cusolverStatus_t cusolverDnSgemmHost(cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, + float* C, + int ldc); + +extern "C" cusolverStatus_t cusolverDnDgemmHost(cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double* alpha, + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, + double* C, + int ldc); + +extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float* d, float* e, int* info); + +extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double* d, double* e, int* info); + +extern "C" cusolverStatus_t cusolverDnSsteqrHost( + const signed char* compz, int n, float* d, float* e, float* z, int ldz, float* work, int* info); + +extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char* compz, + int n, + double* d, + double* e, + double* z, + int ldz, + double* work, + int* info); template class Lapack { @@ -91,182 +150,339 @@ class Lapack { public: static void check_lapack_enabled(); - static void gemm(bool transa, bool transb, int m, int n, int k, T alpha, - const T *A, int lda, const T *B, int ldb, T beta, T *C, + static void gemm(bool transa, + bool transb, + int m, + int n, + int k, + T alpha, + const T* A, + int lda, + const T* B, + int ldb, + T beta, + T* C, int ldc); // special QR for lanczos - static void sterf(int n, T *d, T *e); - static void steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work); + static void sterf(int n, T* d, T* e); + static void steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work); // QR // computes the QR factorization of a general matrix - static void geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork); + static void geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork); // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf. // multiply C by implicit Q - static void ormqr(bool right_side, bool transq, int m, int n, int k, T *a, - int lda, T *tau, T *c, int ldc, T *work, int *lwork); - - static void geev(T *A, T *eigenvalues, int dim, int lda); - static void geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, + static void ormqr(bool right_side, + bool transq, + int m, + int n, + int k, + T* a, + int lda, + T* tau, + T* c, + int ldc, + T* work, + int* lwork); + + static void geev(T* A, T* eigenvalues, int dim, int lda); + static void geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr); + static void geev(T* A, + T* eigenvalues_r, + T* eigenvalues_i, + T* eigenvectors_r, + T* eigenvectors_i, + int dim, + int lda, int ldvr); - static void geev(T *A, T *eigenvalues_r, T *eigenvalues_i, T *eigenvectors_r, - T *eigenvectors_i, int dim, int lda, int ldvr); private: - static void lapack_gemm(const char transa, const char transb, int m, int n, - int k, float alpha, const float *a, int lda, - const float *b, int ldb, float beta, float *c, - int ldc) { - cublasOperation_t cublas_transa = - (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cublas_transb = - (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cusolverDnSgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha, - (float *)a, lda, (float *)b, ldb, &beta, c, ldc); + static void lapack_gemm(const char transa, + const char transb, + int m, + int n, + int k, + float alpha, + const float* a, + int lda, + const float* b, + int ldb, + float beta, + float* c, + int ldc) + { + cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnSgemmHost( + cublas_transa, cublas_transb, m, n, k, &alpha, (float*)a, lda, (float*)b, ldb, &beta, c, ldc); } - static void lapack_gemm(const signed char transa, const signed char transb, - int m, int n, int k, double alpha, const double *a, - int lda, const double *b, int ldb, double beta, - double *c, int ldc) { - cublasOperation_t cublas_transa = - (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cublas_transb = - (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cusolverDnDgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha, - (double *)a, lda, (double *)b, ldb, &beta, c, ldc); + static void lapack_gemm(const signed char transa, + const signed char transb, + int m, + int n, + int k, + double alpha, + const double* a, + int lda, + const double* b, + int ldb, + double beta, + double* c, + int ldc) + { + cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; + cusolverDnDgemmHost(cublas_transa, + cublas_transb, + m, + n, + k, + &alpha, + (double*)a, + lda, + (double*)b, + ldb, + &beta, + c, + ldc); } - static void lapack_sterf(int n, float *d, float *e, int *info) { + static void lapack_sterf(int n, float* d, float* e, int* info) + { cusolverDnSsterfHost(n, d, e, info); } - static void lapack_sterf(int n, double *d, double *e, int *info) { + static void lapack_sterf(int n, double* d, double* e, int* info) + { cusolverDnDsterfHost(n, d, e, info); } - static void lapack_steqr(const signed char compz, int n, float *d, float *e, - float *z, int ldz, float *work, int *info) { + static void lapack_steqr( + const signed char compz, int n, float* d, float* e, float* z, int ldz, float* work, int* info) + { cusolverDnSsteqrHost(&compz, n, d, e, z, ldz, work, info); } - static void lapack_steqr(const signed char compz, int n, double *d, double *e, - double *z, int ldz, double *work, int *info) { + static void lapack_steqr(const signed char compz, + int n, + double* d, + double* e, + double* z, + int ldz, + double* work, + int* info) + { cusolverDnDsteqrHost(&compz, n, d, e, z, ldz, work, info); } - static void lapack_geqrf(int m, int n, float *a, int lda, float *tau, - float *work, int *lwork, int *info) { + static void lapack_geqrf( + int m, int n, float* a, int lda, float* tau, float* work, int* lwork, int* info) + { sgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); } - static void lapack_geqrf(int m, int n, double *a, int lda, double *tau, - double *work, int *lwork, int *info) { + static void lapack_geqrf( + int m, int n, double* a, int lda, double* tau, double* work, int* lwork, int* info) + { dgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); } - static void lapack_ormqr(char side, char trans, int m, int n, int k, float *a, - int lda, float *tau, float *c, int ldc, float *work, - int *lwork, int *info) { - sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, - info); + static void lapack_ormqr(char side, + char trans, + int m, + int n, + int k, + float* a, + int lda, + float* tau, + float* c, + int ldc, + float* work, + int* lwork, + int* info) + { + sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); } - static void lapack_ormqr(char side, char trans, int m, int n, int k, - double *a, int lda, double *tau, double *c, int ldc, - double *work, int *lwork, int *info) { - dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, - info); + static void lapack_ormqr(char side, + char trans, + int m, + int n, + int k, + double* a, + int lda, + double* tau, + double* c, + int ldc, + double* work, + int* lwork, + int* info) + { + dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); } - static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, double *a, - int *lda, double *wr, double *wi, double *vl, - int *ldvl, double *vr, int *ldvr, - double *work, int *lwork, int *info) { - return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, - lwork, info); + static int lapack_geev_dispatch(char* jobvl, + char* jobvr, + int* n, + double* a, + int* lda, + double* wr, + double* wi, + double* vl, + int* ldvl, + double* vr, + int* ldvr, + double* work, + int* lwork, + int* info) + { + return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); } - static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, float *a, - int *lda, float *wr, float *wi, float *vl, - int *ldvl, float *vr, int *ldvr, float *work, - int *lwork, int *info) { - return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, - lwork, info); + static int lapack_geev_dispatch(char* jobvl, + char* jobvr, + int* n, + float* a, + int* lda, + float* wr, + float* wi, + float* vl, + int* ldvl, + float* vr, + int* ldvr, + float* work, + int* lwork, + int* info) + { + return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); } // real eigenvalues - static void lapack_geev(T *A, T *eigenvalues, int dim, int lda) { + static void lapack_geev(T* A, T* eigenvalues, int dim, int lda) + { char job = 'N'; std::vector WI(dim); - int ldv = 1; - T *vl = 0; + int ldv = 1; + T* vl = 0; int work_size = 6 * dim; std::vector work(work_size); int info; - lapack_geev_dispatch(&job, &job, &dim, A, &lda, eigenvalues, WI.data(), vl, - &ldv, vl, &ldv, work.data(), &work_size, &info); + lapack_geev_dispatch(&job, + &job, + &dim, + A, + &lda, + eigenvalues, + WI.data(), + vl, + &ldv, + vl, + &ldv, + work.data(), + &work_size, + &info); lapackCheckError(info); } // real eigenpairs - static void lapack_geev(T *A, T *eigenvalues, T *eigenvectors, int dim, - int lda, int ldvr) { + static void lapack_geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr) + { char jobvl = 'N'; char jobvr = 'V'; std::vector WI(dim); int work_size = 6 * dim; - T *vl = 0; - int ldvl = 1; + T* vl = 0; + int ldvl = 1; std::vector work(work_size); int info; - lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues, WI.data(), - vl, &ldvl, eigenvectors, &ldvr, work.data(), - &work_size, &info); + lapack_geev_dispatch(&jobvl, + &jobvr, + &dim, + A, + &lda, + eigenvalues, + WI.data(), + vl, + &ldvl, + eigenvectors, + &ldvr, + work.data(), + &work_size, + &info); lapackCheckError(info); } // complex eigenpairs - static void lapack_geev(T *A, T *eigenvalues_r, T *eigenvalues_i, - T *eigenvectors_r, T *eigenvectors_i, int dim, - int lda, int ldvr) { - char jobvl = 'N'; - char jobvr = 'V'; + static void lapack_geev(T* A, + T* eigenvalues_r, + T* eigenvalues_i, + T* eigenvectors_r, + T* eigenvectors_i, + int dim, + int lda, + int ldvr) + { + char jobvl = 'N'; + char jobvr = 'V'; int work_size = 8 * dim; - int ldvl = 1; + int ldvl = 1; std::vector work(work_size); int info; - lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues_r, - eigenvalues_i, 0, &ldvl, eigenvectors_r, &ldvr, - work.data(), &work_size, &info); + lapack_geev_dispatch(&jobvl, + &jobvr, + &dim, + A, + &lda, + eigenvalues_r, + eigenvalues_i, + 0, + &ldvl, + eigenvectors_r, + &ldvr, + work.data(), + &work_size, + &info); lapackCheckError(info); } }; template -void Lapack::check_lapack_enabled() { +void Lapack::check_lapack_enabled() +{ #ifndef USE_LAPACK RAFT_FAIL("Error: LAPACK not enabled."); #endif } template -void Lapack::gemm(bool transa, bool transb, int m, int n, int k, T alpha, - const T *A, int lda, const T *B, int ldb, T beta, T *C, - int ldc) { +void Lapack::gemm(bool transa, + bool transb, + int m, + int n, + int k, + T alpha, + const T* A, + int lda, + const T* B, + int ldb, + T beta, + T* C, + int ldc) +{ // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK const char transA_char = transa ? 'T' : 'N'; const char transB_char = transb ? 'T' : 'N'; - lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, - ldc); + lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); //#endif } template -void Lapack::sterf(int n, T *d, T *e) { +void Lapack::sterf(int n, T* d, T* e) +{ // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK int info; @@ -276,7 +492,8 @@ void Lapack::sterf(int n, T *d, T *e) { } template -void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) { +void Lapack::steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work) +{ // check_lapack_enabled(); //#ifdef NVGRAPH_USE_LAPACK int info; @@ -286,8 +503,8 @@ void Lapack::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) { } template -void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, - int *lwork) { +void Lapack::geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork) +{ check_lapack_enabled(); #ifdef USE_LAPACK int info; @@ -296,11 +513,22 @@ void Lapack::geqrf(int m, int n, T *a, int lda, T *tau, T *work, #endif } template -void Lapack::ormqr(bool right_side, bool transq, int m, int n, int k, T *a, - int lda, T *tau, T *c, int ldc, T *work, int *lwork) { +void Lapack::ormqr(bool right_side, + bool transq, + int m, + int n, + int k, + T* a, + int lda, + T* tau, + T* c, + int ldc, + T* work, + int* lwork) +{ check_lapack_enabled(); #ifdef USE_LAPACK - char side = right_side ? 'R' : 'L'; + char side = right_side ? 'R' : 'L'; char trans = transq ? 'T' : 'N'; int info; lapack_ormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info); @@ -310,7 +538,8 @@ void Lapack::ormqr(bool right_side, bool transq, int m, int n, int k, T *a, // real eigenvalues template -void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) { +void Lapack::geev(T* A, T* eigenvalues, int dim, int lda) +{ check_lapack_enabled(); #ifdef USE_LAPACK lapack_geev(A, eigenvalues, dim, lda); @@ -318,8 +547,8 @@ void Lapack::geev(T *A, T *eigenvalues, int dim, int lda) { } // real eigenpairs template -void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, - int ldvr) { +void Lapack::geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr) +{ check_lapack_enabled(); #ifdef USE_LAPACK lapack_geev(A, eigenvalues, eigenvectors, dim, lda, ldvr); @@ -327,13 +556,18 @@ void Lapack::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda, } // complex eigenpairs template -void Lapack::geev(T *A, T *eigenvalues_r, T *eigenvalues_i, - T *eigenvectors_r, T *eigenvectors_i, int dim, int lda, - int ldvr) { +void Lapack::geev(T* A, + T* eigenvalues_r, + T* eigenvalues_i, + T* eigenvectors_r, + T* eigenvectors_i, + int dim, + int lda, + int ldvr) +{ check_lapack_enabled(); #ifdef USE_LAPACK - lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, - dim, lda, ldvr); + lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, dim, lda, ldvr); #endif } diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index 42fc621a1a..9d1f899d66 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -41,10 +41,12 @@ using size_type = int; // for now; TODO: move it in appropriate header // Apply diagonal matrix to vector: // template -static __global__ void diagmv(IndexType_ n, ValueType_ alpha, +static __global__ void diagmv(IndexType_ n, + ValueType_ alpha, const ValueType_* __restrict__ D, const ValueType_* __restrict__ x, - ValueType_* __restrict__ y) { + ValueType_* __restrict__ y) +{ IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { y[i] += alpha * D[i] * x[i]; @@ -59,7 +61,7 @@ enum struct sparse_mv_alg_t : int { SPARSE_MV_UNDEFINED = -1, SPARSE_MV_ALG_DEFAULT, // generic, for any sparse matrix SPARSE_MV_ALG1, // typical for CSR - SPARSE_MV_ALG2 // may provide better performamce for irregular sparse matrices + SPARSE_MV_ALG2 // may provide better performamce for irregular sparse matrices }; // Vector "view"-like aggregate for linear algebra purposes @@ -69,15 +71,14 @@ struct vector_view_t { value_type* buffer_; size_type size_; - vector_view_t(value_type* buffer, size_type sz) - : buffer_(buffer), size_(sz) {} + vector_view_t(value_type* buffer, size_type sz) : buffer_(buffer), size_(sz) {} - vector_view_t(vector_view_t&& other) - : buffer_(other.raw()), size_(other.size()) {} + vector_view_t(vector_view_t&& other) : buffer_(other.raw()), size_(other.size()) {} - vector_view_t& operator=(vector_view_t&& other) { + vector_view_t& operator=(vector_view_t&& other) + { buffer_ = other.raw(); - size_ = other.size(); + size_ = other.size(); } }; @@ -85,8 +86,9 @@ template class vector_t { public: vector_t(handle_t const& raft_handle, size_type sz) - : buffer_(sz, raft_handle.get_stream()), - thrust_policy(raft_handle.get_thrust_policy()) {} + : buffer_(sz, raft_handle.get_stream()), thrust_policy(raft_handle.get_thrust_policy()) + { + } size_type size(void) const { return buffer_.size(); } @@ -94,32 +96,40 @@ class vector_t { value_type const* raw(void) const { return buffer_.data(); } - value_type nrm1() const { - return thrust::reduce(thrust_policy, buffer_.data(), - buffer_.data() + buffer_.size(), value_type{0}, + value_type nrm1() const + { + return thrust::reduce(thrust_policy, + buffer_.data(), + buffer_.data() + buffer_.size(), + value_type{0}, [] __device__(auto left, auto right) { - auto abs_left = left > 0 ? left : -left; + auto abs_left = left > 0 ? left : -left; auto abs_right = right > 0 ? right : -right; return abs_left + abs_right; }); } - void fill(value_type value) { + void fill(value_type value) + { thrust::fill_n(thrust_policy, buffer_.data(), buffer_.size(), value); } private: - using thrust_exec_policy_t = thrust::detail::execute_with_allocator< - rmm::mr::thrust_allocator, thrust::cuda_cub::execute_on_stream_base>; + using thrust_exec_policy_t = + thrust::detail::execute_with_allocator, + thrust::cuda_cub::execute_on_stream_base>; rmm::device_uvector buffer_; const thrust_exec_policy_t thrust_policy; }; template struct sparse_matrix_t { - sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, - index_type const* col_indices, value_type const* values, - index_type const nrows, index_type const ncols, + sparse_matrix_t(handle_t const& raft_handle, + index_type const* row_offsets, + index_type const* col_indices, + value_type const* values, + index_type const nrows, + index_type const ncols, index_type const nnz) : handle_(raft_handle), row_offsets_(row_offsets), @@ -127,18 +137,25 @@ struct sparse_matrix_t { values_(values), nrows_(nrows), ncols_(ncols), - nnz_(nnz) {} + nnz_(nnz) + { + } - sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, - index_type const* col_indices, value_type const* values, - index_type const nrows, index_type const nnz) + sparse_matrix_t(handle_t const& raft_handle, + index_type const* row_offsets, + index_type const* col_indices, + value_type const* values, + index_type const nrows, + index_type const nnz) : handle_(raft_handle), row_offsets_(row_offsets), col_indices_(col_indices), values_(values), nrows_(nrows), ncols_(nrows), - nnz_(nnz) {} + nnz_(nnz) + { + } template sparse_matrix_t(handle_t const& raft_handle, CSRView const& csr_view) @@ -148,7 +165,9 @@ struct sparse_matrix_t { values_(csr_view.edge_data), nrows_(csr_view.number_of_vertices), ncols_(csr_view.number_of_vertices), - nnz_(csr_view.number_of_edges) {} + nnz_(csr_view.number_of_edges) + { + } virtual ~sparse_matrix_t(void) = default; // virtual because used as base for following matrix types @@ -158,21 +177,24 @@ struct sparse_matrix_t { // descriptor creation works with non-const, and const-casting // down is dangerous) // - virtual void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + virtual void mv(value_type alpha, + value_type* __restrict__ x, + value_type beta, value_type* __restrict__ y, sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, - bool transpose = false, bool symmetric = false) const { + bool transpose = false, + bool symmetric = false) const + { using namespace sparse; RAFT_EXPECTS(x != nullptr, "Null x buffer."); RAFT_EXPECTS(y != nullptr, "Null y buffer."); auto cusparse_h = handle_.get_cusparse_handle(); - auto stream = handle_.get_stream(); + auto stream = handle_.get_stream(); - cusparseOperation_t trans = - transpose ? CUSPARSE_OPERATION_TRANSPOSE : // transpose - CUSPARSE_OPERATION_NON_TRANSPOSE; //non-transpose + cusparseOperation_t trans = transpose ? CUSPARSE_OPERATION_TRANSPOSE : // transpose + CUSPARSE_OPERATION_NON_TRANSPOSE; // non-transpose #if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP auto size_x = transpose ? nrows_ : ncols_; @@ -180,15 +202,19 @@ struct sparse_matrix_t { cusparseSpMVAlg_t spmv_alg = translate_algorithm(alg); - //create descriptors: + // create descriptors: //(below casts are necessary, because // cusparseCreateCsr(...) takes non-const // void*; the casts should be harmless) // cusparseSpMatDescr_t matA; - CUSPARSE_CHECK(cusparsecreatecsr( - &matA, nrows_, ncols_, nnz_, const_cast(row_offsets_), - const_cast(col_indices_), const_cast(values_))); + CUSPARSE_CHECK(cusparsecreatecsr(&matA, + nrows_, + ncols_, + nnz_, + const_cast(row_offsets_), + const_cast(col_indices_), + const_cast(values_))); cusparseDnVecDescr_t vecX; CUSPARSE_CHECK(cusparsecreatednvec(&vecX, size_x, x)); @@ -196,31 +222,29 @@ struct sparse_matrix_t { cusparseDnVecDescr_t vecY; CUSPARSE_CHECK(cusparsecreatednvec(&vecY, size_y, y)); - //get (scratch) external device buffer size: + // get (scratch) external device buffer size: // size_t bufferSize; - CUSPARSE_CHECK(cusparsespmv_buffersize(cusparse_h, trans, &alpha, matA, - vecX, &beta, vecY, spmv_alg, - &bufferSize, stream)); + CUSPARSE_CHECK(cusparsespmv_buffersize( + cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, &bufferSize, stream)); - //allocate external buffer: + // allocate external buffer: // vector_t external_buffer(handle_, bufferSize); - //finally perform SpMV: + // finally perform SpMV: // - CUSPARSE_CHECK(cusparsespmv(cusparse_h, trans, &alpha, matA, vecX, &beta, - vecY, spmv_alg, external_buffer.raw(), stream)); + CUSPARSE_CHECK(cusparsespmv( + cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, external_buffer.raw(), stream)); - //free descriptors: + // free descriptors: //(TODO: maybe wrap them in a RAII struct?) // CUSPARSE_CHECK(cusparseDestroyDnVec(vecY)); CUSPARSE_CHECK(cusparseDestroyDnVec(vecX)); CUSPARSE_CHECK(cusparseDestroySpMat(matA)); #else - CUSPARSE_CHECK( - cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); + CUSPARSE_CHECK(cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); cusparseMatDescr_t descr = 0; CUSPARSE_CHECK(cusparseCreateMatDescr(&descr)); if (symmetric) { @@ -229,9 +253,20 @@ struct sparse_matrix_t { CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); } CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); - CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, trans, nrows_, ncols_, nnz_, - &alpha, descr, values_, row_offsets_, - col_indices_, x, &beta, y, stream)); + CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, + trans, + nrows_, + ncols_, + nnz_, + &alpha, + descr, + values_, + row_offsets_, + col_indices_, + x, + &beta, + y, + stream)); CUSPARSE_CHECK(cusparseDestroyMatDescr(descr)); #endif } @@ -239,19 +274,18 @@ struct sparse_matrix_t { handle_t const& get_handle(void) const { return handle_; } #if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP - cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const { + cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const + { switch (alg) { - case sparse_mv_alg_t::SPARSE_MV_ALG1: - return CUSPARSE_CSRMV_ALG1; - case sparse_mv_alg_t::SPARSE_MV_ALG2: - return CUSPARSE_CSRMV_ALG2; - default: - return CUSPARSE_MV_ALG_DEFAULT; + case sparse_mv_alg_t::SPARSE_MV_ALG1: return CUSPARSE_CSRMV_ALG1; + case sparse_mv_alg_t::SPARSE_MV_ALG2: return CUSPARSE_CSRMV_ALG2; + default: return CUSPARSE_MV_ALG_DEFAULT; } } #endif - //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate + // private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, + // aggregate handle_t const& handle_; index_type const* row_offsets_; @@ -264,43 +298,51 @@ struct sparse_matrix_t { template struct laplacian_matrix_t : sparse_matrix_t { - laplacian_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, - index_type const* col_indices, value_type const* values, - index_type const nrows, index_type const nnz) - : sparse_matrix_t(raft_handle, row_offsets, - col_indices, values, nrows, nnz), - diagonal_(raft_handle, nrows) { + laplacian_matrix_t(handle_t const& raft_handle, + index_type const* row_offsets, + index_type const* col_indices, + value_type const* values, + index_type const nrows, + index_type const nnz) + : sparse_matrix_t( + raft_handle, row_offsets, col_indices, values, nrows, nnz), + diagonal_(raft_handle, nrows) + { vector_t ones{raft_handle, nrows}; ones.fill(1.0); - sparse_matrix_t::mv(1, ones.raw(), 0, - diagonal_.raw()); + sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } laplacian_matrix_t(handle_t const& raft_handle, sparse_matrix_t const& csr_m) - : sparse_matrix_t(raft_handle, csr_m.row_offsets_, - csr_m.col_indices_, csr_m.values_, - csr_m.nrows_, csr_m.nnz_), - diagonal_(raft_handle, csr_m.nrows_) { + : sparse_matrix_t(raft_handle, + csr_m.row_offsets_, + csr_m.col_indices_, + csr_m.values_, + csr_m.nrows_, + csr_m.nnz_), + diagonal_(raft_handle, csr_m.nrows_) + { vector_t ones{raft_handle, csr_m.nrows_}; ones.fill(1.0); - sparse_matrix_t::mv(1, ones.raw(), 0, - diagonal_.raw()); + sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } // y = alpha*A*x + beta*y // - void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + void mv(value_type alpha, + value_type* __restrict__ x, + value_type beta, value_type* __restrict__ y, sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, - bool transpose = false, bool symmetric = false) const override { + bool transpose = false, + bool symmetric = false) const override + { constexpr int BLOCK_SIZE = 1024; - auto n = sparse_matrix_t::nrows_; + auto n = sparse_matrix_t::nrows_; - auto cublas_h = - sparse_matrix_t::get_handle().get_cublas_handle(); - auto stream = - sparse_matrix_t::get_handle().get_stream(); + auto cublas_h = sparse_matrix_t::get_handle().get_cublas_handle(); + auto stream = sparse_matrix_t::get_handle().get_stream(); // scales y by beta: // @@ -312,8 +354,7 @@ struct laplacian_matrix_t : sparse_matrix_t { // Apply diagonal matrix // - dim3 gridDim{ - std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; + dim3 gridDim{std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; dim3 blockDim{BLOCK_SIZE, 1, 1}; diagmv<<>>(n, alpha, diagonal_.raw(), x, y); @@ -321,8 +362,7 @@ struct laplacian_matrix_t : sparse_matrix_t { // Apply adjacency matrix // - sparse_matrix_t::mv(-alpha, x, 1, y, alg, transpose, - symmetric); + sparse_matrix_t::mv(-alpha, x, 1, y, alg, transpose, symmetric); } vector_t diagonal_; @@ -332,52 +372,66 @@ template struct modularity_matrix_t : laplacian_matrix_t { modularity_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, - index_type const* col_indices, value_type const* values, - index_type const nrows, index_type const nnz) + index_type const* col_indices, + value_type const* values, + index_type const nrows, + index_type const nnz) : laplacian_matrix_t( - raft_handle, row_offsets, col_indices, values, nrows, nnz) { + raft_handle, row_offsets, col_indices, values, nrows, nnz) + { edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(); } modularity_matrix_t(handle_t const& raft_handle, sparse_matrix_t const& csr_m) - : laplacian_matrix_t(raft_handle, csr_m) { + : laplacian_matrix_t(raft_handle, csr_m) + { edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(); } // y = alpha*A*x + beta*y // - void mv(value_type alpha, value_type* __restrict__ x, value_type beta, + void mv(value_type alpha, + value_type* __restrict__ x, + value_type beta, value_type* __restrict__ y, sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, - bool transpose = false, bool symmetric = false) const override { + bool transpose = false, + bool symmetric = false) const override + { auto n = sparse_matrix_t::nrows_; - auto cublas_h = - sparse_matrix_t::get_handle().get_cublas_handle(); - auto stream = - sparse_matrix_t::get_handle().get_stream(); + auto cublas_h = sparse_matrix_t::get_handle().get_cublas_handle(); + auto stream = sparse_matrix_t::get_handle().get_stream(); // y = A*x // - sparse_matrix_t::mv(alpha, x, 0, y, alg, transpose, - symmetric); + sparse_matrix_t::mv(alpha, x, 0, y, alg, transpose, symmetric); value_type dot_res; // gamma = d'*x // // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); - CUBLAS_CHECK(linalg::cublasdot( - cublas_h, n, laplacian_matrix_t::diagonal_.raw(), - 1, x, 1, &dot_res, stream)); + CUBLAS_CHECK(linalg::cublasdot(cublas_h, + n, + laplacian_matrix_t::diagonal_.raw(), + 1, + x, + 1, + &dot_res, + stream)); // y = y -(gamma/edge_sum)*d // value_type gamma_ = -dot_res / edge_sum_; - CUBLAS_CHECK(linalg::cublasaxpy( - cublas_h, n, &gamma_, - laplacian_matrix_t::diagonal_.raw(), 1, y, 1, - stream)); + CUBLAS_CHECK(linalg::cublasaxpy(cublas_h, + n, + &gamma_, + laplacian_matrix_t::diagonal_.raw(), + 1, + y, + 1, + stream)); } value_type edge_sum_; diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index fededbfcb4..0e0e47ddf3 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -39,7 +39,8 @@ #endif #ifdef COLLECT_TIME_STATISTICS -static double timer(void) { +static double timer(void) +{ struct timeval tv; cudaDeviceSynchronize(); gettimeofday(&tv, NULL); @@ -78,17 +79,21 @@ using namespace linalg; * performed. * @return error flag. */ -template +template std::tuple modularity_maximization( - handle_t const &handle, sparse_matrix_t const &csr_m, - EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, - vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { + handle_t const& handle, + sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); auto cublas_h = handle.get_cublas_handle(); std::tuple @@ -102,11 +107,10 @@ std::tuple modularity_maximization( modularity_matrix_t B{handle, csr_m}; auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_config.n_eigVecs; + auto nEigVecs = eigen_config.n_eigVecs; // Compute eigenvectors corresponding to largest eigenvalues - std::get<0>(stats) = - eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); + std::get<0>(stats) = eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); // Whiten eigenvector matrix transform_eigen_matrix(handle, n, nEigVecs, eigVecs); @@ -117,8 +121,7 @@ std::tuple modularity_maximization( CHECK_CUDA(stream); // Find partition clustering - auto pair_cluster = - cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); + auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -137,11 +140,12 @@ std::tuple modularity_maximization( * @param modularity On exit, modularity */ template -void analyzeModularity(handle_t const &handle, - sparse_matrix_t const &csr_m, +void analyzeModularity(handle_t const& handle, + sparse_matrix_t const& csr_m, vertex_t nClusters, - vertex_t const *__restrict__ clusters, - weight_t &modularity) { + vertex_t const* __restrict__ clusters, + weight_t& modularity) +{ RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); vertex_t i; @@ -149,15 +153,14 @@ void analyzeModularity(handle_t const &handle, weight_t partModularity, clustersize; auto cublas_h = handle.get_cublas_handle(); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); // Device memory vector_t part_i(handle, n); vector_t Bx(handle, n); // Initialize cuBLAS - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity modularity_matrix_t B{handle, csr_m}; @@ -167,8 +170,7 @@ void analyzeModularity(handle_t const &handle, // Iterate through partitions for (i = 0; i < nClusters; ++i) { - if (!construct_indicator(handle, i, n, clustersize, partModularity, - clusters, part_i, Bx, B)) { + if (!construct_indicator(handle, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) { WARNING("empty partition"); continue; } diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 2df3812a4a..88cc8aa8f0 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -61,21 +61,25 @@ using namespace linalg; * performed. * @return statistics: number of eigensolver iterations, . */ -template -std::tuple partition( - handle_t const &handle, sparse_matrix_t const &csr_m, - EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, - vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { +template +std::tuple partition(handle_t const& handle, + sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); auto cublas_h = handle.get_cublas_handle(); std::tuple - stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver + stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, + //cluster solver residual, # iters cluster solver vertex_t n = csr_m.nrows_; @@ -86,22 +90,20 @@ std::tuple partition( // Compute eigenvectors of Laplacian // Initialize Laplacian - ///sparse_matrix_t A{handle, graph}; + /// sparse_matrix_t A{handle, graph}; laplacian_matrix_t L{handle, csr_m}; auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_config.n_eigVecs; + auto nEigVecs = eigen_config.n_eigVecs; // Compute smallest eigenvalues and eigenvectors - std::get<0>(stats) = - eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); + std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); // Whiten eigenvector matrix transform_eigen_matrix(handle, n, nEigVecs, eigVecs); // Find partition clustering - auto pair_cluster = - cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); + auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -128,16 +130,19 @@ std::tuple partition( * @return error flag. */ template -void analyzePartition(handle_t const &handle, - sparse_matrix_t const &csr_m, - vertex_t nClusters, const vertex_t *__restrict__ clusters, - weight_t &edgeCut, weight_t &cost) { +void analyzePartition(handle_t const& handle, + sparse_matrix_t const& csr_m, + vertex_t nClusters, + const vertex_t* __restrict__ clusters, + weight_t& edgeCut, + weight_t& cost) +{ RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); vertex_t i; vertex_t n = csr_m.nrows_; - auto stream = handle.get_stream(); + auto stream = handle.get_stream(); auto cublas_h = handle.get_cublas_handle(); weight_t partEdgesCut, clustersize; @@ -147,22 +152,20 @@ void analyzePartition(handle_t const &handle, vector_t Lx(handle, n); // Initialize cuBLAS - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Laplacian - ///sparse_matrix_t A{handle, graph}; + /// sparse_matrix_t A{handle, graph}; laplacian_matrix_t L{handle, csr_m}; // Initialize output - cost = 0; + cost = 0; edgeCut = 0; // Iterate through partitions for (i = 0; i < nClusters; ++i) { // Construct indicator vector for ith partition - if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, - part_i, Lx, L)) { + if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) { WARNING("empty partition"); continue; } diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index c148350c0f..44b4af4bdc 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -27,20 +27,18 @@ namespace raft { namespace spectral { template -static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, - value_type_t* obs) { +static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, value_type_t* obs) +{ index_type_t i, j, k, index, mm; value_type_t alpha, v, last; bool valid; // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension // compute alpha - mm = (((m + blockDim.x - 1) / blockDim.x) * - blockDim.x); // m in multiple of blockDim.x + mm = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x); // m in multiple of blockDim.x alpha = 0.0; - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; - j += blockDim.y * gridDim.y) { + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { for (i = threadIdx.x; i < mm; i += blockDim.x) { // check if the thread is valid valid = i < m; @@ -65,17 +63,17 @@ static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, // scale by alpha alpha = __shfl_sync(warp_full_mask(), alpha, blockDim.x - 1, blockDim.x); alpha = std::sqrt(alpha); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; - j += blockDim.y * gridDim.y) { + for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 - index = i + j * m; + index = i + j * m; obs[index] = obs[index] / alpha; } } } template -index_type_t next_pow2(index_type_t n) { +index_type_t next_pow2(index_type_t n) +{ index_type_t v; // Reference: // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float @@ -89,7 +87,8 @@ index_type_t next_pow2(index_type_t n) { } template -cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) { +cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) +{ index_type_t p2m; // find next power of 2 @@ -101,17 +100,16 @@ cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) { dim3 nblocks{1, (n + nthreads.y - 1) / nthreads.y, 1}; // launch scaling kernel (scale each column of obs by its norm) - scale_obs_kernel - <<>>(m, n, obs); + scale_obs_kernel<<>>(m, n, obs); return cudaSuccess; } template -void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, - weight_t* eigVecs) { - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); +void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, weight_t* eigVecs) +{ + auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); auto thrust_exec_policy = handle.get_thrust_policy(); const weight_t zero{0.0}; @@ -121,9 +119,9 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, for (auto i = 0; i < nEigVecs; ++i) { weight_t mean, std; - mean = thrust::reduce( - thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + mean = thrust::reduce(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); CHECK_CUDA(stream); mean /= n; thrust::transform(thrust_exec_policy, @@ -134,8 +132,7 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, thrust::minus()); CHECK_CUDA(stream); - CUBLAS_CHECK( - cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + CUBLAS_CHECK(cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); std /= std::sqrt(static_cast(n)); @@ -152,16 +149,25 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, // TODO: in-place transpose { vector_t work(handle, nEigVecs * n); - CUBLAS_CHECK( - cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n, - &one, eigVecs, n, &zero, (weight_t*)NULL, nEigVecs, - work.raw(), nEigVecs, stream)); - - CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(), - nEigVecs * n * sizeof(weight_t), - cudaMemcpyDeviceToDevice, stream)); + CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + CUBLAS_CHECK(cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + nEigVecs, + n, + &one, + eigVecs, + n, + &zero, + (weight_t*)NULL, + nEigVecs, + work.raw(), + nEigVecs, + stream)); + + CUDA_TRY(cudaMemcpyAsync( + eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream)); } } @@ -176,9 +182,9 @@ struct equal_to_i_op { public: equal_to_i_op(index_type_t _i) : i(_i) {} template - __host__ __device__ void operator()(Tuple_ t) { - thrust::get<1>(t) = - (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0; + __host__ __device__ void operator()(Tuple_ t) + { + thrust::get<1>(t) = (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0; } }; } // namespace @@ -186,38 +192,38 @@ struct equal_to_i_op { // Construct indicator vector for ith partition // template -bool construct_indicator(handle_t const& handle, edge_t index, edge_t n, - weight_t& clustersize, weight_t& partStats, +bool construct_indicator(handle_t const& handle, + edge_t index, + edge_t n, + weight_t& clustersize, + weight_t& partStats, vertex_t const* __restrict__ clusters, - vector_t& part_i, vector_t& Bx, - laplacian_matrix_t const& B) { - auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); + vector_t& part_i, + vector_t& Bx, + laplacian_matrix_t const& B) +{ + auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); auto thrust_exec_policy = handle.get_thrust_policy(); - thrust::for_each(thrust_exec_policy, - thrust::make_zip_iterator(thrust::make_tuple( - thrust::device_pointer_cast(clusters), - thrust::device_pointer_cast(part_i.raw()))), - thrust::make_zip_iterator(thrust::make_tuple( - thrust::device_pointer_cast(clusters + n), - thrust::device_pointer_cast(part_i.raw() + n))), - equal_to_i_op(index)); + thrust::for_each( + thrust_exec_policy, + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters), + thrust::device_pointer_cast(part_i.raw()))), + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters + n), + thrust::device_pointer_cast(part_i.raw() + n))), + equal_to_i_op(index)); CHECK_CUDA(stream); // Compute size of ith partition - CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, - &clustersize, stream)); + CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream)); clustersize = round(clustersize); - if (clustersize < 0.5) { - return false; - } + if (clustersize < 0.5) { return false; } // Compute part stats B.mv(1, part_i.raw(), 0, Bx.raw()); - CUBLAS_CHECK( - cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); + CUBLAS_CHECK(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); return true; } diff --git a/cpp/include/raft/spectral/warn_dbg.hpp b/cpp/include/raft/spectral/warn_dbg.hpp index 406f1b7c7e..08a4e6efb5 100644 --- a/cpp/include/raft/spectral/warn_dbg.hpp +++ b/cpp/include/raft/spectral/warn_dbg.hpp @@ -4,13 +4,13 @@ #include #define STRINGIFY_DETAIL(x) #x -#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) +#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x) #ifdef DEBUG #define COUT() (std::cout) #define CERR() (std::cerr) -//nope: +// nope: // #define WARNING(message) \ do { \ diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh index 1b338a035a..e8e6bea4dd 100644 --- a/cpp/include/raft/stats/detail/mean.cuh +++ b/cpp/include/raft/stats/detail/mean.cuh @@ -27,15 +27,15 @@ namespace detail { ///@todo: ColsPerBlk has been tested only for 32! template -__global__ void meanKernelRowMajor(Type *mu, const Type *data, IdxType D, - IdxType N) { +__global__ void meanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) +{ const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; for (IdxType i = rowId; i < N; i += stride) thread_data += (colId < D) ? data[i * D + colId] : Type(0); __shared__ Type smu[ColsPerBlk]; @@ -47,8 +47,8 @@ __global__ void meanKernelRowMajor(Type *mu, const Type *data, IdxType D, } template -__global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D, - IdxType N) { +__global__ void meanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); @@ -58,30 +58,26 @@ __global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D, thread_data += data[idx]; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { - mu[blockIdx.x] = acc / N; - } + if (threadIdx.x == 0) { mu[blockIdx.x] = acc / N; } } template -void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample, - bool rowMajor, cudaStream_t stream) { +void mean( + Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) +{ static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), - raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemsetAsync(mu, 0, sizeof(Type) * D, stream)); - meanKernelRowMajor - <<>>(mu, data, D, N); + meanKernelRowMajor<<>>(mu, data, D, N); CUDA_CHECK(cudaPeekAtLastError()); Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); raft::linalg::scalarMultiply(mu, mu, ratio, D, stream); } else { - meanKernelColMajor - <<>>(mu, data, D, N); + meanKernelColMajor<<>>(mu, data, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh index e8917a60b3..42351269ea 100644 --- a/cpp/include/raft/stats/detail/stddev.cuh +++ b/cpp/include/raft/stats/detail/stddev.cuh @@ -27,15 +27,15 @@ namespace detail { ///@todo: ColPerBlk has been tested only for 32! template -__global__ void stddevKernelRowMajor(Type *std, const Type *data, IdxType D, - IdxType N) { +__global__ void stddevKernelRowMajor(Type* std, const Type* data, IdxType D, IdxType N) +{ const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; for (IdxType i = rowId; i < N; i += stride) { Type val = (colId < D) ? data[i * D + colId] : Type(0); thread_data += val * val; @@ -49,41 +49,39 @@ __global__ void stddevKernelRowMajor(Type *std, const Type *data, IdxType D, } template -__global__ void stddevKernelColMajor(Type *std, const Type *data, - const Type *mu, IdxType D, IdxType N) { +__global__ void stddevKernelColMajor( + Type* std, const Type* data, const Type* mu, IdxType D, IdxType N) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; + Type m = mu[blockIdx.x]; for (IdxType i = threadIdx.x; i < N; i += TPB) { IdxType idx = colStart + i; - Type diff = data[idx] - m; + Type diff = data[idx] - m; thread_data += diff * diff; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { - std[blockIdx.x] = raft::mySqrt(acc / N); - } + if (threadIdx.x == 0) { std[blockIdx.x] = raft::mySqrt(acc / N); } } template -__global__ void varsKernelColMajor(Type *var, const Type *data, const Type *mu, - IdxType D, IdxType N) { +__global__ void varsKernelColMajor( + Type* var, const Type* data, const Type* mu, IdxType D, IdxType N) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; + Type m = mu[blockIdx.x]; for (IdxType i = threadIdx.x; i < N; i += TPB) { IdxType idx = colStart + i; - Type diff = data[idx] - m; + Type diff = data[idx] - m; thread_data += diff * diff; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { - var[blockIdx.x] = acc / N; - } + if (threadIdx.x == 0) { var[blockIdx.x] = acc / N; } } /** @@ -105,70 +103,78 @@ __global__ void varsKernelColMajor(Type *var, const Type *data, const Type *mu, * @param stream cuda stream where to launch work */ template -void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N, - bool sample, bool rowMajor, cudaStream_t stream) { +void stddev(Type* std, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool sample, + bool rowMajor, + cudaStream_t stream) +{ static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), - raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemset(std, 0, sizeof(Type) * D)); - stddevKernelRowMajor - <<>>(std, data, D, N); + stddevKernelRowMajor<<>>(std, data, D, N); Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); raft::linalg::binaryOp( - std, std, mu, D, - [ratio] __device__(Type a, Type b) { - return raft::mySqrt(a * ratio - b * b); - }, + std, + std, + mu, + D, + [ratio] __device__(Type a, Type b) { return raft::mySqrt(a * ratio - b * b); }, stream); } else { - stddevKernelColMajor - <<>>(std, data, mu, D, N); + stddevKernelColMajor<<>>(std, data, mu, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } /** - * @brief Compute variance of the input matrix - * - * Variance operation is assumed to be performed on a given column. - * - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @param var the output stddev vector - * @param data the input matrix - * @param mu the mean vector - * @param D number of columns of data - * @param N number of rows of data - * @param sample whether to evaluate sample stddev or not. In other words, - * whether - * to normalize the output using N-1 or N, for true or false, respectively - * @param rowMajor whether the input data is row or col major - * @param stream cuda stream where to launch work - */ + * @brief Compute variance of the input matrix + * + * Variance operation is assumed to be performed on a given column. + * + * @tparam Type the data type + * @tparam IdxType Integer type used to for addressing + * @param var the output stddev vector + * @param data the input matrix + * @param mu the mean vector + * @param D number of columns of data + * @param N number of rows of data + * @param sample whether to evaluate sample stddev or not. In other words, + * whether + * to normalize the output using N-1 or N, for true or false, respectively + * @param rowMajor whether the input data is row or col major + * @param stream cuda stream where to launch work + */ template -void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N, - bool sample, bool rowMajor, cudaStream_t stream) { +void vars(Type* var, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool sample, + bool rowMajor, + cudaStream_t stream) +{ static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), - raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemset(var, 0, sizeof(Type) * D)); - stddevKernelRowMajor - <<>>(var, data, D, N); + stddevKernelRowMajor<<>>(var, data, D, N); Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); raft::linalg::binaryOp( - var, var, mu, D, - [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream); + var, var, mu, D, [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream); } else { - varsKernelColMajor - <<>>(var, data, mu, D, N); + varsKernelColMajor<<>>(var, data, mu, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/stats/detail/sum.cuh b/cpp/include/raft/stats/detail/sum.cuh index 37a3313ed1..b7f5cc8ff7 100644 --- a/cpp/include/raft/stats/detail/sum.cuh +++ b/cpp/include/raft/stats/detail/sum.cuh @@ -27,15 +27,15 @@ namespace detail { ///@todo: ColsPerBlk has been tested only for 32! template -__global__ void sumKernelRowMajor(Type *mu, const Type *data, IdxType D, - IdxType N) { +__global__ void sumKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) +{ const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; + IdxType thisColId = threadIdx.x % ColsPerBlk; + IdxType thisRowId = threadIdx.x / ColsPerBlk; + IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); + IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); + Type thread_data = Type(0); + const IdxType stride = RowsPerBlkPerIter * gridDim.x; for (IdxType i = rowId; i < N; i += stride) thread_data += (colId < D) ? data[i * D + colId] : Type(0); __shared__ Type smu[ColsPerBlk]; @@ -47,8 +47,8 @@ __global__ void sumKernelRowMajor(Type *mu, const Type *data, IdxType D, } template -__global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D, - IdxType N) { +__global__ void sumKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; Type thread_data = Type(0); @@ -58,27 +58,23 @@ __global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D, thread_data += data[idx]; } Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { - mu[blockIdx.x] = acc; - } + if (threadIdx.x == 0) { mu[blockIdx.x] = acc; } } template -void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor, - cudaStream_t stream) { +void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream) +{ static const int TPB = 256; if (rowMajor) { static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), - raft::ceildiv(D, (IdxType)ColsPerBlk)); + static const int ColsPerBlk = 32; + static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; + dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); CUDA_CHECK(cudaMemset(output, 0, sizeof(Type) * D)); sumKernelRowMajor <<>>(output, input, D, N); } else { - sumKernelColMajor - <<>>(output, input, D, N); + sumKernelColMajor<<>>(output, input, D, N); } CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/stats/mean.hpp b/cpp/include/raft/stats/mean.hpp index 6e4cf39850..ba1eb55e71 100644 --- a/cpp/include/raft/stats/mean.hpp +++ b/cpp/include/raft/stats/mean.hpp @@ -41,8 +41,9 @@ namespace stats { * @param stream: cuda stream */ template -void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample, - bool rowMajor, cudaStream_t stream) { +void mean( + Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) +{ detail::mean(mu, data, D, N, sample, rowMajor, stream); } diff --git a/cpp/include/raft/stats/mean_center.hpp b/cpp/include/raft/stats/mean_center.hpp index 04934d4388..c0ba24312b 100644 --- a/cpp/include/raft/stats/mean_center.hpp +++ b/cpp/include/raft/stats/mean_center.hpp @@ -38,12 +38,25 @@ namespace stats { * @param stream cuda stream where to launch work */ template -void meanCenter(Type *out, const Type *data, const Type *mu, IdxType D, - IdxType N, bool rowMajor, bool bcastAlongRows, - cudaStream_t stream) { +void meanCenter(Type* out, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - out, data, mu, D, N, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a - b; }, stream); + out, + data, + mu, + D, + N, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a - b; }, + stream); } /** @@ -61,11 +74,25 @@ void meanCenter(Type *out, const Type *data, const Type *mu, IdxType D, * @param stream cuda stream where to launch work */ template -void meanAdd(Type *out, const Type *data, const Type *mu, IdxType D, IdxType N, - bool rowMajor, bool bcastAlongRows, cudaStream_t stream) { +void meanAdd(Type* out, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ raft::linalg::matrixVectorOp( - out, data, mu, D, N, rowMajor, bcastAlongRows, - [] __device__(Type a, Type b) { return a + b; }, stream); + out, + data, + mu, + D, + N, + rowMajor, + bcastAlongRows, + [] __device__(Type a, Type b) { return a + b; }, + stream); } }; // end namespace stats diff --git a/cpp/include/raft/stats/stddev.hpp b/cpp/include/raft/stats/stddev.hpp index 17c5ae457d..9393dec8bc 100644 --- a/cpp/include/raft/stats/stddev.hpp +++ b/cpp/include/raft/stats/stddev.hpp @@ -42,8 +42,15 @@ namespace stats { * @param stream cuda stream where to launch work */ template -void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N, - bool sample, bool rowMajor, cudaStream_t stream) { +void stddev(Type* std, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool sample, + bool rowMajor, + cudaStream_t stream) +{ detail::stddev(std, data, mu, D, N, sample, rowMajor, stream); } @@ -66,8 +73,15 @@ void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N, * @param stream cuda stream where to launch work */ template -void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N, - bool sample, bool rowMajor, cudaStream_t stream) { +void vars(Type* var, + const Type* data, + const Type* mu, + IdxType D, + IdxType N, + bool sample, + bool rowMajor, + cudaStream_t stream) +{ detail::vars(var, data, mu, D, N, sample, rowMajor, stream); } diff --git a/cpp/include/raft/stats/sum.hpp b/cpp/include/raft/stats/sum.hpp index 4f67acdf36..cfb5142a14 100644 --- a/cpp/include/raft/stats/sum.hpp +++ b/cpp/include/raft/stats/sum.hpp @@ -38,8 +38,8 @@ namespace stats { * @param stream cuda stream where to launch work */ template -void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor, - cudaStream_t stream) { +void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream) +{ detail::sum(output, input, D, N, rowMajor, stream); } diff --git a/cpp/include/raft/vectorized.cuh b/cpp/include/raft/vectorized.cuh index ceffbcca78..b44d8bb4ad 100644 --- a/cpp/include/raft/vectorized.cuh +++ b/cpp/include/raft/vectorized.cuh @@ -22,11 +22,11 @@ namespace raft { template -struct IOType {}; +struct IOType { +}; template <> struct IOType { - static_assert(sizeof(bool) == sizeof(int8_t), - "IOType bool size assumption failed"); + static_assert(sizeof(bool) == sizeof(int8_t), "IOType bool size assumption failed"); typedef int8_t Type; }; template <> @@ -215,50 +215,50 @@ struct IOType { }; /** - * @struct TxN_t - * - * @brief Internal data structure that is used to define a facade for vectorized - * loads/stores across the most common POD types. The goal of his file is to - * provide with CUDA programmers, an easy way to have compiler issue vectorized - * load or store instructions to memory (either global or shared). Vectorized - * accesses to memory are important as they'll utilize its resources - * efficiently, - * when compared to their non-vectorized counterparts. Obviously, for whatever - * reasons if one is unable to issue such vectorized operations, one can always - * fallback to using POD types. - * - * Concept of vectorized accesses : Threads process multiple elements - * to speed up processing. These are loaded in a single read thanks - * to type promotion. It is then reinterpreted as a vector elements - * to perform the kernel's work. - * - * Caution : vectorized accesses requires input adresses to be memory aligned - * according not to the input type but to the promoted type used for reading. - * - * Example demonstrating the use of load operations, performing math on such - * loaded data and finally storing it back. - * @code{.cu} - * TxN_t mydata1, mydata2; - * int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * mydata1.Ratio; - * mydata1.load(ptr1, idx); - * mydata2.load(ptr2, idx); - * #pragma unroll - * for(int i=0;i type. - * Only change required is to replace variable declaration appropriately. - * - * Obviously, it's caller's responsibility to take care of pointer alignment! - * - * @tparam math_ the data-type in which the compute/math needs to happen - * @tparam veclen_ the number of 'math_' types to be loaded/stored per - * instruction - */ + * @struct TxN_t + * + * @brief Internal data structure that is used to define a facade for vectorized + * loads/stores across the most common POD types. The goal of his file is to + * provide with CUDA programmers, an easy way to have compiler issue vectorized + * load or store instructions to memory (either global or shared). Vectorized + * accesses to memory are important as they'll utilize its resources + * efficiently, + * when compared to their non-vectorized counterparts. Obviously, for whatever + * reasons if one is unable to issue such vectorized operations, one can always + * fallback to using POD types. + * + * Concept of vectorized accesses : Threads process multiple elements + * to speed up processing. These are loaded in a single read thanks + * to type promotion. It is then reinterpreted as a vector elements + * to perform the kernel's work. + * + * Caution : vectorized accesses requires input adresses to be memory aligned + * according not to the input type but to the promoted type used for reading. + * + * Example demonstrating the use of load operations, performing math on such + * loaded data and finally storing it back. + * @code{.cu} + * TxN_t mydata1, mydata2; + * int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * mydata1.Ratio; + * mydata1.load(ptr1, idx); + * mydata2.load(ptr2, idx); + * #pragma unroll + * for(int i=0;i type. + * Only change required is to replace variable declaration appropriately. + * + * Obviously, it's caller's responsibility to take care of pointer alignment! + * + * @tparam math_ the data-type in which the compute/math needs to happen + * @tparam veclen_ the number of 'math_' types to be loaded/stored per + * instruction + */ template struct TxN_t { /** underlying math data type */ @@ -282,7 +282,8 @@ struct TxN_t { * @brief Fill the contents of this structure with a constant value * @param _val the constant to be filled */ - DI void fill(math_t _val) { + DI void fill(math_t _val) + { #pragma unroll for (int i = 0; i < Ratio; ++i) { val.data[i] = _val; @@ -307,21 +308,24 @@ struct TxN_t { * @{ */ template - DI void load(const math_t *ptr, idx_t idx) { - const io_t *bptr = reinterpret_cast(&ptr[idx]); - val.internal = __ldg(bptr); + DI void load(const math_t* ptr, idx_t idx) + { + const io_t* bptr = reinterpret_cast(&ptr[idx]); + val.internal = __ldg(bptr); } template - DI void load(math_t *ptr, idx_t idx) { - io_t *bptr = reinterpret_cast(&ptr[idx]); + DI void load(math_t* ptr, idx_t idx) + { + io_t* bptr = reinterpret_cast(&ptr[idx]); val.internal = *bptr; } template - DI void store(math_t *ptr, idx_t idx) { - io_t *bptr = reinterpret_cast(&ptr[idx]); - *bptr = val.internal; + DI void store(math_t* ptr, idx_t idx) + { + io_t* bptr = reinterpret_cast(&ptr[idx]); + *bptr = val.internal; } /** @} */ }; @@ -338,11 +342,17 @@ struct TxN_t { DI void fill(math_t _val) {} template - DI void load(const math_t *ptr, idx_t idx) {} + DI void load(const math_t* ptr, idx_t idx) + { + } template - DI void load(math_t *ptr, idx_t idx) {} + DI void load(math_t* ptr, idx_t idx) + { + } template - DI void store(math_t *ptr, idx_t idx) {} + DI void store(math_t* ptr, idx_t idx) + { + } }; } // namespace raft diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu index 06b246d9a1..2c7996514a 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster_solvers.cu @@ -23,7 +23,8 @@ namespace raft { -TEST(Raft, ClusterSolvers) { +TEST(Raft, ClusterSolvers) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -40,7 +41,7 @@ TEST(Raft, ClusterSolvers) { index_type d{10}; index_type k{5}; - //nullptr expected to trigger exceptions: + // nullptr expected to trigger exceptions: // value_type* eigvecs{nullptr}; index_type* codes{nullptr}; @@ -52,7 +53,8 @@ TEST(Raft, ClusterSolvers) { EXPECT_ANY_THROW(cluster_solver.solve(h, n, d, eigvecs, codes)); } -TEST(Raft, ModularitySolvers) { +TEST(Raft, ModularitySolvers) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -66,7 +68,7 @@ TEST(Raft, ModularitySolvers) { value_type tol{1.0e-10}; bool reorthog{true}; - //nullptr expected to trigger exceptions: + // nullptr expected to trigger exceptions: // index_type* clusters{nullptr}; value_type* eigvals{nullptr}; @@ -80,13 +82,11 @@ TEST(Raft, ModularitySolvers) { index_type k{5}; - cluster_solver_config_t clust_cfg{k, maxiter, tol, - seed}; + cluster_solver_config_t clust_cfg{k, maxiter, tol, seed}; kmeans_solver_t cluster_solver{clust_cfg}; auto stream = h.get_stream(); - sparse_matrix_t sm{h, nullptr, nullptr, - nullptr, 0, 0}; + sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; EXPECT_ANY_THROW(spectral::modularity_maximization( h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); diff --git a/cpp/test/cudart_utils.cpp b/cpp/test/cudart_utils.cpp index c14d880efd..150767992f 100644 --- a/cpp/test/cudart_utils.cpp +++ b/cpp/test/cudart_utils.cpp @@ -20,7 +20,8 @@ namespace raft { -TEST(Raft, Utils) { +TEST(Raft, Utils) +{ ASSERT_NO_THROW(ASSERT(1 == 1, "Should not assert!")); ASSERT_THROW(ASSERT(1 != 1, "Should assert!"), exception); ASSERT_THROW(THROW("Should throw!"), exception); diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu index efa1e2cd41..21d7e9d753 100644 --- a/cpp/test/distance/dist_adj.cu +++ b/cpp/test/distance/dist_adj.cu @@ -26,30 +26,42 @@ namespace raft { namespace distance { template -__global__ void naiveDistanceAdjKernel(bool *dist, const DataType *x, - const DataType *y, int m, int n, int k, - DataType eps, bool isRowMajor) { +__global__ void naiveDistanceAdjKernel(bool* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + DataType eps, + bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; auto diff = x[xidx] - y[yidx]; acc += diff * diff; } - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc <= eps; } template -void naiveDistanceAdj(bool *dist, const DataType *x, const DataType *y, int m, - int n, int k, DataType eps, bool isRowMajor) { +void naiveDistanceAdj(bool* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + DataType eps, + bool isRowMajor) +{ static const dim3 TPB(16, 32, 1); dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1); - naiveDistanceAdjKernel - <<>>(dist, x, y, m, n, k, eps, isRowMajor); + naiveDistanceAdjKernel<<>>(dist, x, y, m, n, k, eps, isRowMajor); CUDA_CHECK(cudaPeekAtLastError()); } @@ -62,26 +74,28 @@ struct DistanceAdjInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const DistanceAdjInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const DistanceAdjInputs& dims) +{ return os; } template -class DistanceAdjTest - : public ::testing::TestWithParam> { +class DistanceAdjTest : public ::testing::TestWithParam> { public: DistanceAdjTest() : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), dist(params.m * params.n, stream), - dist_ref(params.m * params.n, stream) {} + dist_ref(params.m * params.n, stream) + { + } - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); - int m = params.m; - int n = params.n; - int k = params.k; + int m = params.m; + int n = params.n; + int k = params.k; bool isRowMajor = params.isRowMajor; rmm::device_uvector x(m * k, stream); @@ -92,21 +106,27 @@ class DistanceAdjTest DataType threshold = params.eps; - naiveDistanceAdj(dist_ref.data(), x.data(), y.data(), m, n, k, threshold, - isRowMajor); - size_t worksize = - raft::distance::getWorkspaceSize( + naiveDistanceAdj(dist_ref.data(), x.data(), y.data(), m, n, k, threshold, isRowMajor); + size_t worksize = raft::distance:: + getWorkspaceSize( x.data(), y.data(), m, n, k); rmm::device_uvector workspace(worksize, stream); auto fin_op = [threshold] __device__(DataType d_val, int g_d_idx) { return d_val <= threshold; }; - raft::distance::distance( - x.data(), y.data(), dist.data(), m, n, k, workspace.data(), - workspace.size(), fin_op, stream, isRowMajor); + raft::distance::distance( + x.data(), + y.data(), + dist.data(), + m, + n, + k, + workspace.data(), + workspace.size(), + fin_op, + stream, + isRowMajor); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -131,14 +151,13 @@ const std::vector> inputsf = { {10.0f, 1024, 1024, 32, false, 1234ULL}, }; typedef DistanceAdjTest DistanceAdjTestF; -TEST_P(DistanceAdjTestF, Result) { +TEST_P(DistanceAdjTestF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE( - devArrMatch(dist_ref.data(), dist.data(), m, n, raft::Compare())); + ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, raft::Compare())); } -INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.01, 1024, 1024, 32, true, 1234ULL}, @@ -151,14 +170,13 @@ const std::vector> inputsd = { {10.0, 1024, 1024, 32, false, 1234ULL}, }; typedef DistanceAdjTest DistanceAdjTestD; -TEST_P(DistanceAdjTestD, Result) { +TEST_P(DistanceAdjTestD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE( - devArrMatch(dist_ref.data(), dist.data(), m, n, raft::Compare())); + ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, raft::Compare())); } -INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestD, ::testing::ValuesIn(inputsd)); } // namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_canberra.cu b/cpp/test/distance/dist_canberra.cu index bddfdff3b6..db318605b4 100644 --- a/cpp/test/distance/dist_canberra.cu +++ b/cpp/test/distance/dist_canberra.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceCanberra - : public DistanceTest {}; +class DistanceCanberra : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceCanberra DistanceCanberraF; -TEST_P(DistanceCanberraF, Result) { +TEST_P(DistanceCanberraF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceCanberra DistanceCanberraD; -TEST_P(DistanceCanberraD, Result) { +TEST_P(DistanceCanberraD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_chebyshev.cu b/cpp/test/distance/dist_chebyshev.cu index 0dc6edfaad..c7dccfe712 100644 --- a/cpp/test/distance/dist_chebyshev.cu +++ b/cpp/test/distance/dist_chebyshev.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceLinf - : public DistanceTest {}; +class DistanceLinf : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceLinf DistanceLinfF; -TEST_P(DistanceLinfF, Result) { +TEST_P(DistanceLinfF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceLinf DistanceLinfD; -TEST_P(DistanceLinfD, Result) { +TEST_P(DistanceLinfD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_correlation.cu b/cpp/test/distance/dist_correlation.cu index f6dc015738..0648ed96ca 100644 --- a/cpp/test/distance/dist_correlation.cu +++ b/cpp/test/distance/dist_correlation.cu @@ -22,8 +22,8 @@ namespace distance { template class DistanceCorrelation - : public DistanceTest {}; + : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -36,14 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceCorrelation DistanceCorrelationF; -TEST_P(DistanceCorrelationF, Result) { +TEST_P(DistanceCorrelationF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCorrelationF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCorrelationF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceCorrelation DistanceCorrelationD; -TEST_P(DistanceCorrelationD, Result) { +TEST_P(DistanceCorrelationD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCorrelationD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCorrelationD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_cos.cu b/cpp/test/distance/dist_cos.cu index 2487bcbd95..b3e6a4c97f 100644 --- a/cpp/test/distance/dist_cos.cu +++ b/cpp/test/distance/dist_cos.cu @@ -21,9 +21,8 @@ namespace raft { namespace distance { template -class DistanceExpCos - : public DistanceTest {}; +class DistanceExpCos : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -36,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceExpCos DistanceExpCosF; -TEST_P(DistanceExpCosF, Result) { +TEST_P(DistanceExpCosF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +55,14 @@ const std::vector> inputsd = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceExpCos DistanceExpCosD; -TEST_P(DistanceExpCosD, Result) { +TEST_P(DistanceExpCosD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_euc_exp.cu b/cpp/test/distance/dist_euc_exp.cu index a6ef01aa45..75ff7e682a 100644 --- a/cpp/test/distance/dist_euc_exp.cu +++ b/cpp/test/distance/dist_euc_exp.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceEucExpTest - : public DistanceTest {}; +class DistanceEucExpTest : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucExpTest DistanceEucExpTestF; -TEST_P(DistanceEucExpTestF, Result) { +TEST_P(DistanceEucExpTestF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucExpTest DistanceEucExpTestD; -TEST_P(DistanceEucExpTestD, Result) { +TEST_P(DistanceEucExpTestD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_euc_unexp.cu b/cpp/test/distance/dist_euc_unexp.cu index 290abda352..88affa16d5 100644 --- a/cpp/test/distance/dist_euc_unexp.cu +++ b/cpp/test/distance/dist_euc_unexp.cu @@ -36,14 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucUnexpTest DistanceEucUnexpTestF; -TEST_P(DistanceEucUnexpTestF, Result) { +TEST_P(DistanceEucUnexpTestF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceEucUnexpTest DistanceEucUnexpTestD; -TEST_P(DistanceEucUnexpTestD, Result) { +TEST_P(DistanceEucUnexpTestD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_hamming.cu b/cpp/test/distance/dist_hamming.cu index 0123c8bada..631adc751c 100644 --- a/cpp/test/distance/dist_hamming.cu +++ b/cpp/test/distance/dist_hamming.cu @@ -22,8 +22,8 @@ namespace distance { template class DistanceHamming - : public DistanceTest {}; + : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -36,14 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceHamming DistanceHammingF; -TEST_P(DistanceHammingF, Result) { +TEST_P(DistanceHammingF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHammingF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHammingF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceHamming DistanceHammingD; -TEST_P(DistanceHammingD, Result) { +TEST_P(DistanceHammingD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHammingD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHammingD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_hellinger.cu b/cpp/test/distance/dist_hellinger.cu index 39d197f786..8a07c8836f 100644 --- a/cpp/test/distance/dist_hellinger.cu +++ b/cpp/test/distance/dist_hellinger.cu @@ -22,8 +22,8 @@ namespace distance { template class DistanceHellingerExp - : public DistanceTest {}; + : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -36,14 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceHellingerExp DistanceHellingerExpF; -TEST_P(DistanceHellingerExpF, Result) { +TEST_P(DistanceHellingerExpF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceHellingerExp DistanceHellingerExpD; -TEST_P(DistanceHellingerExpD, Result) { +TEST_P(DistanceHellingerExpD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_jensen_shannon.cu b/cpp/test/distance/dist_jensen_shannon.cu index 9070ce92c1..3cda31a852 100644 --- a/cpp/test/distance/dist_jensen_shannon.cu +++ b/cpp/test/distance/dist_jensen_shannon.cu @@ -36,14 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceJensenShannon DistanceJensenShannonF; -TEST_P(DistanceJensenShannonF, Result) { +TEST_P(DistanceJensenShannonF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceJensenShannonF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceJensenShannonF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceJensenShannon DistanceJensenShannonD; -TEST_P(DistanceJensenShannonD, Result) { +TEST_P(DistanceJensenShannonD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceJensenShannonD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceJensenShannonD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_kl_divergence.cu b/cpp/test/distance/dist_kl_divergence.cu index 7c32596527..4303b8cc8f 100644 --- a/cpp/test/distance/dist_kl_divergence.cu +++ b/cpp/test/distance/dist_kl_divergence.cu @@ -36,14 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceKLDivergence DistanceKLDivergenceF; -TEST_P(DistanceKLDivergenceF, Result) { +TEST_P(DistanceKLDivergenceF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceKLDivergenceF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceKLDivergenceF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceKLDivergence DistanceKLDivergenceD; -TEST_P(DistanceKLDivergenceD, Result) { +TEST_P(DistanceKLDivergenceD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceKLDivergenceD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceKLDivergenceD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_l1.cu b/cpp/test/distance/dist_l1.cu index ff7705d195..dad160ca41 100644 --- a/cpp/test/distance/dist_l1.cu +++ b/cpp/test/distance/dist_l1.cu @@ -21,8 +21,8 @@ namespace raft { namespace distance { template -class DistanceUnexpL1 - : public DistanceTest {}; +class DistanceUnexpL1 : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -35,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceUnexpL1 DistanceUnexpL1F; -TEST_P(DistanceUnexpL1F, Result) { +TEST_P(DistanceUnexpL1F, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1F, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1F, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -55,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceUnexpL1 DistanceUnexpL1D; -TEST_P(DistanceUnexpL1D, Result) { +TEST_P(DistanceUnexpL1D, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1D, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1D, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_minkowski.cu b/cpp/test/distance/dist_minkowski.cu index 7d87bbc2c7..34f6d2825e 100644 --- a/cpp/test/distance/dist_minkowski.cu +++ b/cpp/test/distance/dist_minkowski.cu @@ -21,8 +21,7 @@ namespace raft { namespace distance { template -class DistanceLpUnexp - : public DistanceTest { +class DistanceLpUnexp : public DistanceTest { }; const std::vector> inputsf = { @@ -36,14 +35,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL, 3.0f}, }; typedef DistanceLpUnexp DistanceLpUnexpF; -TEST_P(DistanceLpUnexpF, Result) { +TEST_P(DistanceLpUnexpF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL, 4.0}, @@ -56,14 +55,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL, 3.0}, }; typedef DistanceLpUnexp DistanceLpUnexpD; -TEST_P(DistanceLpUnexpD, Result) { +TEST_P(DistanceLpUnexpD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/dist_russell_rao.cu b/cpp/test/distance/dist_russell_rao.cu index ae735951a8..e0bfcd7eb3 100644 --- a/cpp/test/distance/dist_russell_rao.cu +++ b/cpp/test/distance/dist_russell_rao.cu @@ -22,8 +22,8 @@ namespace distance { template class DistanceRussellRao - : public DistanceTest {}; + : public DistanceTest { +}; const std::vector> inputsf = { {0.001f, 1024, 1024, 32, true, 1234ULL}, @@ -36,14 +36,14 @@ const std::vector> inputsf = { {0.003f, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceRussellRao DistanceRussellRaoF; -TEST_P(DistanceRussellRaoF, Result) { +TEST_P(DistanceRussellRaoF, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceRussellRaoF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceRussellRaoF, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.001, 1024, 1024, 32, true, 1234ULL}, @@ -56,14 +56,14 @@ const std::vector> inputsd = { {0.003, 1024, 1024, 1024, false, 1234ULL}, }; typedef DistanceRussellRao DistanceRussellRaoD; -TEST_P(DistanceRussellRaoD, Result) { +TEST_P(DistanceRussellRaoD, Result) +{ int m = params.isRowMajor ? params.m : params.n; int n = params.isRowMajor ? params.n : params.m; - ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + dist_ref.data(), dist.data(), m, n, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceRussellRaoD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceRussellRaoD, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index f31fbc9165..f445e3b578 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -25,43 +25,52 @@ namespace raft { namespace distance { template -__global__ void naiveDistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, int k, +__global__ void naiveDistanceKernel(DataType* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, raft::distance::DistanceType type, - bool isRowMajor) { + bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; auto diff = x[xidx] - y[yidx]; acc += diff * diff; } if (type == raft::distance::DistanceType::L2SqrtExpanded || type == raft::distance::DistanceType::L2SqrtUnexpanded) acc = raft::mySqrt(acc); - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -__global__ void naiveL1_Linf_CanberraDistanceKernel( - DataType *dist, const DataType *x, const DataType *y, int m, int n, int k, - raft::distance::DistanceType type, bool isRowMajor) { +__global__ void naiveL1_Linf_CanberraDistanceKernel(DataType* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + raft::distance::DistanceType type, + bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; - if (midx >= m || nidx >= n) { - return; - } + if (midx >= m || nidx >= n) { return; } DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; auto diff = (a > b) ? (a - b) : (b - a); if (type == raft::distance::DistanceType::Linf) { acc = raft::myMax(acc, diff); @@ -75,29 +84,27 @@ __global__ void naiveL1_Linf_CanberraDistanceKernel( } } - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -__global__ void naiveCosineDistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor) { +__global__ void naiveCosineDistanceKernel( + DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; - if (midx >= m || nidx >= n) { - return; - } + if (midx >= m || nidx >= n) { return; } - DataType acc_a = DataType(0); - DataType acc_b = DataType(0); + DataType acc_a = DataType(0); + DataType acc_b = DataType(0); DataType acc_ab = DataType(0); for (int i = 0; i < k; ++i) { int xidx = isRowMajor ? i + midx * k : i * m + midx; int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + auto a = x[xidx]; + auto b = y[yidx]; acc_a += a * a; acc_b += b * b; acc_ab += a * b; @@ -106,64 +113,67 @@ __global__ void naiveCosineDistanceKernel(DataType *dist, const DataType *x, int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; // Use 1.0 - (cosine similarity) to calc the distance - dist[outidx] = - (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b)); + dist[outidx] = (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b)); } template -__global__ void naiveHellingerDistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor) { +__global__ void naiveHellingerDistanceKernel( + DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; - if (midx >= m || nidx >= n) { - return; - } + if (midx >= m || nidx >= n) { return; } DataType acc_ab = DataType(0); for (int i = 0; i < k; ++i) { int xidx = isRowMajor ? i + midx * k : i * m + midx; int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + auto a = x[xidx]; + auto b = y[yidx]; acc_ab += raft::mySqrt(a) * raft::mySqrt(b); } int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative - acc_ab = 1 - acc_ab; + acc_ab = 1 - acc_ab; auto rectifier = (!signbit(acc_ab)); - dist[outidx] = raft::mySqrt(rectifier * acc_ab); + dist[outidx] = raft::mySqrt(rectifier * acc_ab); } template -__global__ void naiveLpUnexpDistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor, DataType p) { +__global__ void naiveLpUnexpDistanceKernel(DataType* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + bool isRowMajor, + DataType p) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; DataType acc = DataType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; auto diff = raft::L1Op()(a - b); acc += raft::myPow(diff, p); } auto one_over_p = 1 / p; - acc = raft::myPow(acc, one_over_p); - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; - dist[outidx] = acc; + acc = raft::myPow(acc, one_over_p); + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + dist[outidx] = acc; } template -__global__ void naiveHammingDistanceKernel(DataType *dist, const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor) { +__global__ void naiveHammingDistanceKernel( + DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; @@ -171,21 +181,19 @@ __global__ void naiveHammingDistanceKernel(DataType *dist, const DataType *x, for (int i = 0; i < k; ++i) { int xidx = isRowMajor ? i + midx * k : i * m + midx; int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + auto a = x[xidx]; + auto b = y[yidx]; acc += (a != b); } - acc = acc / k; - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + acc = acc / k; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -__global__ void naiveJensenShannonDistanceKernel(DataType *dist, - const DataType *x, - const DataType *y, int m, - int n, int k, - bool isRowMajor) { +__global__ void naiveJensenShannonDistanceKernel( + DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; @@ -193,10 +201,10 @@ __global__ void naiveJensenShannonDistanceKernel(DataType *dist, for (int i = 0; i < k; ++i) { int xidx = isRowMajor ? i + midx * k : i * m + midx; int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + auto a = x[xidx]; + auto b = y[yidx]; - DataType m = 0.5f * (a + b); + DataType m = 0.5f * (a + b); bool a_zero = a == 0; bool b_zero = b == 0; @@ -206,18 +214,17 @@ __global__ void naiveJensenShannonDistanceKernel(DataType *dist, bool p_zero = p == 0; bool q_zero = q == 0; - acc += - (-a * (!p_zero * log(p + p_zero))) + (-b * (!q_zero * log(q + q_zero))); + acc += (-a * (!p_zero * log(p + p_zero))) + (-b * (!q_zero * log(q + q_zero))); } - acc = raft::mySqrt(0.5f * acc); - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + acc = raft::mySqrt(0.5f * acc); + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -__global__ void naiveRussellRaoDistanceKernel(OutType *dist, const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor) { +__global__ void naiveRussellRaoDistanceKernel( + OutType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; @@ -225,56 +232,55 @@ __global__ void naiveRussellRaoDistanceKernel(OutType *dist, const DataType *x, for (int i = 0; i < k; ++i) { int xidx = isRowMajor ? i + midx * k : i * m + midx; int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + auto a = x[xidx]; + auto b = y[yidx]; acc += (a * b); } - acc = (k - acc) / k; - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + acc = (k - acc) / k; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -__global__ void naiveKLDivergenceDistanceKernel(OutType *dist, - const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor) { +__global__ void naiveKLDivergenceDistanceKernel( + OutType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; OutType acc = OutType(0); for (int i = 0; i < k; ++i) { - int xidx = isRowMajor ? i + midx * k : i * m + midx; - int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; - bool b_zero = (b == 0); - const auto m = (!b_zero) * (a / b); + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; + bool b_zero = (b == 0); + const auto m = (!b_zero) * (a / b); const bool m_zero = (m == 0); acc += (a * (!m_zero) * log(m + m_zero)); } - acc = 0.5f * acc; - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + acc = 0.5f * acc; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -__global__ void naiveCorrelationDistanceKernel(OutType *dist, const DataType *x, - const DataType *y, int m, int n, - int k, bool isRowMajor) { +__global__ void naiveCorrelationDistanceKernel( + OutType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor) +{ int midx = threadIdx.x + blockIdx.x * blockDim.x; int nidx = threadIdx.y + blockIdx.y * blockDim.y; if (midx >= m || nidx >= n) return; - OutType acc = OutType(0); - auto a_norm = DataType(0); - auto b_norm = DataType(0); + OutType acc = OutType(0); + auto a_norm = DataType(0); + auto b_norm = DataType(0); auto a_sq_norm = DataType(0); auto b_sq_norm = DataType(0); for (int i = 0; i < k; ++i) { int xidx = isRowMajor ? i + midx * k : i * m + midx; int yidx = isRowMajor ? i + nidx * k : i * n + nidx; - auto a = x[xidx]; - auto b = y[yidx]; + auto a = x[xidx]; + auto b = y[yidx]; a_norm += a; b_norm += b; a_sq_norm += (a * a); @@ -282,20 +288,27 @@ __global__ void naiveCorrelationDistanceKernel(OutType *dist, const DataType *x, acc += (a * b); } - auto numer = k * acc - (a_norm * b_norm); + auto numer = k * acc - (a_norm * b_norm); auto Q_denom = k * a_sq_norm - (a_norm * a_norm); auto R_denom = k * b_sq_norm - (b_norm * b_norm); acc = 1 - (numer / raft::mySqrt(Q_denom * R_denom)); - int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; dist[outidx] = acc; } template -void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m, - int n, int k, raft::distance::DistanceType type, - bool isRowMajor, DataType metric_arg = 2.0f) { +void naiveDistance(DataType* dist, + const DataType* x, + const DataType* y, + int m, + int n, + int k, + raft::distance::DistanceType type, + bool isRowMajor, + DataType metric_arg = 2.0f) +{ static const dim3 TPB(16, 32, 1); dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1); @@ -310,43 +323,34 @@ void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m, case raft::distance::DistanceType::L2Unexpanded: case raft::distance::DistanceType::L2SqrtExpanded: case raft::distance::DistanceType::L2Expanded: - naiveDistanceKernel - <<>>(dist, x, y, m, n, k, type, isRowMajor); + naiveDistanceKernel<<>>(dist, x, y, m, n, k, type, isRowMajor); break; case raft::distance::DistanceType::CosineExpanded: - naiveCosineDistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveCosineDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); break; case raft::distance::DistanceType::HellingerExpanded: - naiveHellingerDistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveHellingerDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); break; case raft::distance::DistanceType::LpUnexpanded: naiveLpUnexpDistanceKernel <<>>(dist, x, y, m, n, k, isRowMajor, metric_arg); break; case raft::distance::DistanceType::HammingUnexpanded: - naiveHammingDistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveHammingDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); break; case raft::distance::DistanceType::JensenShannon: - naiveJensenShannonDistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveJensenShannonDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); break; case raft::distance::DistanceType::RusselRaoExpanded: - naiveRussellRaoDistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveRussellRaoDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); break; case raft::distance::DistanceType::KLDivergence: - naiveKLDivergenceDistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveKLDivergenceDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); break; case raft::distance::DistanceType::CorrelationExpanded: - naiveCorrelationDistanceKernel - <<>>(dist, x, y, m, n, k, isRowMajor); + naiveCorrelationDistanceKernel<<>>(dist, x, y, m, n, k, isRowMajor); break; - default: - FAIL() << "should be here\n"; + default: FAIL() << "should be here\n"; } CUDA_CHECK(cudaPeekAtLastError()); } @@ -361,24 +365,33 @@ struct DistanceInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const DistanceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const DistanceInputs& dims) +{ return os; } template -void distanceLauncher(DataType *x, DataType *y, DataType *dist, DataType *dist2, - int m, int n, int k, DistanceInputs ¶ms, - DataType threshold, char *workspace, size_t worksize, - cudaStream_t stream, bool isRowMajor, - DataType metric_arg = 2.0f) { +void distanceLauncher(DataType* x, + DataType* y, + DataType* dist, + DataType* dist2, + int m, + int n, + int k, + DistanceInputs& params, + DataType threshold, + char* workspace, + size_t worksize, + cudaStream_t stream, + bool isRowMajor, + DataType metric_arg = 2.0f) +{ auto fin_op = [dist2, threshold] __device__(DataType d_val, int g_d_idx) { dist2[g_d_idx] = (d_val < threshold) ? 0.f : d_val; return d_val; }; raft::distance::distance( - x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, - metric_arg); + x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg); } template @@ -391,23 +404,25 @@ class DistanceTest : public ::testing::TestWithParam> { y(params.n * params.k, stream), dist_ref(params.m * params.n, stream), dist(params.m * params.n, stream), - dist2(params.m * params.n, stream) {} + dist2(params.m * params.n, stream) + { + } - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); - int m = params.m; - int n = params.n; - int k = params.k; + int m = params.m; + int n = params.n; + int k = params.k; DataType metric_arg = params.metric_arg; - bool isRowMajor = params.isRowMajor; + bool isRowMajor = params.isRowMajor; if (distanceType == raft::distance::DistanceType::HellingerExpanded || distanceType == raft::distance::DistanceType::JensenShannon || distanceType == raft::distance::DistanceType::KLDivergence) { // Hellinger works only on positive numbers r.uniform(x.data(), m * k, DataType(0.0), DataType(1.0), stream); r.uniform(y.data(), n * k, DataType(0.0), DataType(1.0), stream); - } else if (distanceType == - raft::distance::DistanceType::RusselRaoExpanded) { + } else if (distanceType == raft::distance::DistanceType::RusselRaoExpanded) { r.uniform(x.data(), m * k, DataType(0.0), DataType(1.0), stream); r.uniform(y.data(), n * k, DataType(0.0), DataType(1.0), stream); // Russel rao works on boolean values. @@ -417,17 +432,27 @@ class DistanceTest : public ::testing::TestWithParam> { r.uniform(x.data(), m * k, DataType(-1.0), DataType(1.0), stream); r.uniform(y.data(), n * k, DataType(-1.0), DataType(1.0), stream); } - naiveDistance(dist_ref.data(), x.data(), y.data(), m, n, k, distanceType, - isRowMajor, metric_arg); - size_t worksize = - raft::distance::getWorkspaceSize(x.data(), y.data(), m, n, k); + naiveDistance( + dist_ref.data(), x.data(), y.data(), m, n, k, distanceType, isRowMajor, metric_arg); + size_t worksize = raft::distance::getWorkspaceSize( + x.data(), y.data(), m, n, k); rmm::device_uvector workspace(worksize, stream); DataType threshold = -10000.f; - distanceLauncher( - x.data(), y.data(), dist.data(), dist2.data(), m, n, k, params, threshold, - workspace.data(), workspace.size(), stream, isRowMajor, metric_arg); + distanceLauncher(x.data(), + y.data(), + dist.data(), + dist2.data(), + m, + n, + k, + params, + threshold, + workspace.data(), + workspace.size(), + stream, + isRowMajor, + metric_arg); CUDA_CHECK(cudaStreamSynchronize(stream)); } diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index 33782baf8d..932857c536 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -30,40 +30,40 @@ template struct CubKVPMinReduce { typedef cub::KeyValuePair KVP; - DI KVP operator()(LabelT rit, const KVP &a, const KVP &b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } - DI KVP operator()(const KVP &a, const KVP &b) { - return b.value < a.value ? b : a; - } + DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } }; // KVPMinReduce template -__global__ void naiveKernel(cub::KeyValuePair *min, DataT *x, - DataT *y, int m, int n, int k, int *workspace, - DataT maxVal) { - int midx = threadIdx.y + blockIdx.y * blockDim.y; - int nidx = threadIdx.x + blockIdx.x * blockDim.x; +__global__ void naiveKernel(cub::KeyValuePair* min, + DataT* x, + DataT* y, + int m, + int n, + int k, + int* workspace, + DataT maxVal) +{ + int midx = threadIdx.y + blockIdx.y * blockDim.y; + int nidx = threadIdx.x + blockIdx.x * blockDim.x; DataT acc = DataT(0); for (int i = 0; i < k; ++i) { - int xidx = i + midx * k; - int yidx = i + nidx * k; + int xidx = i + midx * k; + int yidx = i + nidx * k; auto diff = midx >= m || nidx >= n ? DataT(0) : x[xidx] - y[yidx]; acc += diff * diff; } - if (Sqrt) { - acc = raft::mySqrt(acc); - } + if (Sqrt) { acc = raft::mySqrt(acc); } ReduceOpT redOp; typedef cub::WarpReduce> WarpReduce; __shared__ typename WarpReduce::TempStorage temp[NWARPS]; int warpId = threadIdx.x / raft::WarpSize; cub::KeyValuePair tmp; - tmp.key = nidx; + tmp.key = nidx; tmp.value = midx >= m || nidx >= n ? maxVal : acc; - tmp = WarpReduce(temp[warpId]).Reduce(tmp, CubKVPMinReduce()); + tmp = WarpReduce(temp[warpId]).Reduce(tmp, CubKVPMinReduce()); if (threadIdx.x % raft::WarpSize == 0 && midx < m) { while (atomicCAS(workspace + midx, 0, 1) == 1) ; @@ -75,8 +75,15 @@ __global__ void naiveKernel(cub::KeyValuePair *min, DataT *x, } template -void naive(cub::KeyValuePair *min, DataT *x, DataT *y, int m, int n, - int k, int *workspace, cudaStream_t stream) { +void naive(cub::KeyValuePair* min, + DataT* x, + DataT* y, + int m, + int n, + int k, + int* workspace, + cudaStream_t stream) +{ static const dim3 TPB(32, 16, 1); dim3 nblks(raft::ceildiv(n, (int)TPB.x), raft::ceildiv(m, (int)TPB.y), 1); CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream)); @@ -86,8 +93,7 @@ void naive(cub::KeyValuePair *min, DataT *x, DataT *y, int m, int n, <<>>(min, m, std::numeric_limits::max(), op); CUDA_CHECK(cudaGetLastError()); naiveKernel, 16> - <<>>(min, x, y, m, n, k, workspace, - std::numeric_limits::max()); + <<>>(min, x, y, m, n, k, workspace, std::numeric_limits::max()); CUDA_CHECK(cudaGetLastError()); } @@ -110,10 +116,13 @@ class FusedL2NNTest : public ::testing::TestWithParam> { yn(params.n, stream), min(params.m, stream), min_ref(params.m, stream), - workspace(params.m * sizeof(int), stream) {} + workspace(params.m * sizeof(int), stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int m = params.m; int n = params.n; @@ -121,10 +130,8 @@ class FusedL2NNTest : public ::testing::TestWithParam> { r.uniform(x.data(), m * k, DataT(-1.0), DataT(1.0), stream); r.uniform(y.data(), n * k, DataT(-1.0), DataT(1.0), stream); generateGoldenResult(); - raft::linalg::rowNorm(xn.data(), x.data(), k, m, raft::linalg::L2Norm, true, - stream); - raft::linalg::rowNorm(yn.data(), y.data(), k, n, raft::linalg::L2Norm, true, - stream); + raft::linalg::rowNorm(xn.data(), x.data(), k, m, raft::linalg::L2Norm, true, stream); + raft::linalg::rowNorm(yn.data(), y.data(), k, n, raft::linalg::L2Norm, true, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -140,23 +147,34 @@ class FusedL2NNTest : public ::testing::TestWithParam> { raft::handle_t handle; cudaStream_t stream; - virtual void generateGoldenResult() { + virtual void generateGoldenResult() + { int m = params.m; int n = params.n; int k = params.k; - naive(min_ref.data(), x.data(), y.data(), m, n, k, - (int *)workspace.data(), stream); + naive(min_ref.data(), x.data(), y.data(), m, n, k, (int*)workspace.data(), stream); } - void runTest(cub::KeyValuePair *out) { + void runTest(cub::KeyValuePair* out) + { int m = params.m; int n = params.n; int k = params.k; MinAndDistanceReduceOp redOp; - fusedL2NN, int>( - out, x.data(), y.data(), xn.data(), yn.data(), m, n, k, - (void *)workspace.data(), redOp, - raft::distance::KVPMinReduce(), Sqrt, true, stream); + fusedL2NN, int>(out, + x.data(), + y.data(), + xn.data(), + yn.data(), + m, + n, + k, + (void*)workspace.data(), + redOp, + raft::distance::KVPMinReduce(), + Sqrt, + true, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } }; @@ -165,9 +183,10 @@ template struct CompareApproxAbsKVP { typedef typename cub::KeyValuePair KVP; CompareApproxAbsKVP(T eps_) : eps(eps_) {} - bool operator()(const KVP &a, const KVP &b) const { - T diff = raft::abs(raft::abs(a.value) - raft::abs(b.value)); - T m = std::max(raft::abs(a.value), raft::abs(b.value)); + bool operator()(const KVP& a, const KVP& b) const + { + T diff = raft::abs(raft::abs(a.value) - raft::abs(b.value)); + T m = std::max(raft::abs(a.value), raft::abs(b.value)); T ratio = m >= eps ? diff / m : diff; return (ratio <= eps); } @@ -179,17 +198,20 @@ struct CompareApproxAbsKVP { template struct CompareExactKVP { typedef typename cub::KeyValuePair KVP; - bool operator()(const KVP &a, const KVP &b) const { + bool operator()(const KVP& a, const KVP& b) const + { if (a.value != b.value) return false; return true; } }; template -::testing::AssertionResult devArrMatch(const cub::KeyValuePair *expected, - const cub::KeyValuePair *actual, - size_t size, L eq_compare, - cudaStream_t stream = 0) { +::testing::AssertionResult devArrMatch(const cub::KeyValuePair* expected, + const cub::KeyValuePair* actual, + size_t size, + L eq_compare, + cudaStream_t stream = 0) +{ typedef typename cub::KeyValuePair KVP; std::shared_ptr exp_h(new KVP[size]); std::shared_ptr act_h(new KVP[size]); @@ -201,47 +223,44 @@ template auto act = act_h.get()[i]; if (!eq_compare(exp, act)) { return ::testing::AssertionFailure() - << "actual=" << act.key << "," << act.value - << " != expected=" << exp.key << "," << exp.value << " @" << i; + << "actual=" << act.key << "," << act.value << " != expected=" << exp.key << "," + << exp.value << " @" << i; } } return ::testing::AssertionSuccess(); } const std::vector> inputsf = { - {0.001f, 32, 32, 32, 1234ULL}, {0.001f, 32, 64, 32, 1234ULL}, - {0.001f, 64, 32, 32, 1234ULL}, {0.001f, 64, 64, 32, 1234ULL}, - {0.001f, 128, 32, 32, 1234ULL}, {0.001f, 128, 64, 32, 1234ULL}, + {0.001f, 32, 32, 32, 1234ULL}, {0.001f, 32, 64, 32, 1234ULL}, {0.001f, 64, 32, 32, 1234ULL}, + {0.001f, 64, 64, 32, 1234ULL}, {0.001f, 128, 32, 32, 1234ULL}, {0.001f, 128, 64, 32, 1234ULL}, {0.001f, 128, 128, 64, 1234ULL}, {0.001f, 64, 128, 128, 1234ULL}, - {0.001f, 32, 32, 34, 1234ULL}, {0.001f, 32, 64, 34, 1234ULL}, - {0.001f, 64, 32, 34, 1234ULL}, {0.001f, 64, 64, 34, 1234ULL}, - {0.001f, 128, 32, 34, 1234ULL}, {0.001f, 128, 64, 34, 1234ULL}, + {0.001f, 32, 32, 34, 1234ULL}, {0.001f, 32, 64, 34, 1234ULL}, {0.001f, 64, 32, 34, 1234ULL}, + {0.001f, 64, 64, 34, 1234ULL}, {0.001f, 128, 32, 34, 1234ULL}, {0.001f, 128, 64, 34, 1234ULL}, {0.001f, 128, 128, 66, 1234ULL}, {0.001f, 64, 128, 130, 1234ULL}, - {0.001f, 32, 32, 33, 1234ULL}, {0.001f, 32, 64, 33, 1234ULL}, - {0.001f, 64, 32, 33, 1234ULL}, {0.001f, 64, 64, 33, 1234ULL}, - {0.001f, 128, 32, 33, 1234ULL}, {0.001f, 128, 64, 33, 1234ULL}, + {0.001f, 32, 32, 33, 1234ULL}, {0.001f, 32, 64, 33, 1234ULL}, {0.001f, 64, 32, 33, 1234ULL}, + {0.001f, 64, 64, 33, 1234ULL}, {0.001f, 128, 32, 33, 1234ULL}, {0.001f, 128, 64, 33, 1234ULL}, {0.001f, 128, 128, 65, 1234ULL}, {0.001f, 64, 128, 129, 1234ULL}, {0.006f, 1805, 134, 2, 1234ULL}, }; typedef FusedL2NNTest FusedL2NNTestF_Sq; -TEST_P(FusedL2NNTestF_Sq, Result) { +TEST_P(FusedL2NNTestF_Sq, Result) +{ runTest(min.data()); - ASSERT_TRUE(devArrMatch(min_ref.data(), min.data(), params.m, - CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch( + min_ref.data(), min.data(), params.m, CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sq, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sq, ::testing::ValuesIn(inputsf)); typedef FusedL2NNTest FusedL2NNTestF_Sqrt; -TEST_P(FusedL2NNTestF_Sqrt, Result) { +TEST_P(FusedL2NNTestF_Sqrt, Result) +{ runTest(min.data()); - ASSERT_TRUE(devArrMatch(min_ref.data(), min.data(), params.m, - CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch( + min_ref.data(), min.data(), params.m, CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sqrt, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sqrt, ::testing::ValuesIn(inputsf)); const std::vector> inputsd = { {0.00001, 32, 32, 32, 1234ULL}, {0.00001, 32, 64, 32, 1234ULL}, @@ -262,21 +281,21 @@ const std::vector> inputsd = { {0.00001, 1805, 134, 2, 1234ULL}, }; typedef FusedL2NNTest FusedL2NNTestD_Sq; -TEST_P(FusedL2NNTestD_Sq, Result) { +TEST_P(FusedL2NNTestD_Sq, Result) +{ runTest(min.data()); - ASSERT_TRUE(devArrMatch(min_ref.data(), min.data(), params.m, - CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch( + min_ref.data(), min.data(), params.m, CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sq, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sq, ::testing::ValuesIn(inputsd)); typedef FusedL2NNTest FusedL2NNTestD_Sqrt; -TEST_P(FusedL2NNTestD_Sqrt, Result) { +TEST_P(FusedL2NNTestD_Sqrt, Result) +{ runTest(min.data()); - ASSERT_TRUE(devArrMatch(min_ref.data(), min.data(), params.m, - CompareApproxAbsKVP(params.tolerance))); + ASSERT_TRUE(devArrMatch( + min_ref.data(), min.data(), params.m, CompareApproxAbsKVP(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sqrt, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sqrt, ::testing::ValuesIn(inputsd)); /// This is to test output determinism of the prim template @@ -284,7 +303,8 @@ class FusedL2NNDetTest : public FusedL2NNTest { public: FusedL2NNDetTest() : stream(handle.get_stream()), min1(0, stream) {} - void SetUp() override { + void SetUp() override + { FusedL2NNTest::SetUp(); int m = this->params.m; min1.resize(m, stream); @@ -305,50 +325,46 @@ class FusedL2NNDetTest : public FusedL2NNTest { }; typedef FusedL2NNDetTest FusedL2NNDetTestF_Sq; -TEST_P(FusedL2NNDetTestF_Sq, Result) { +TEST_P(FusedL2NNDetTestF_Sq, Result) +{ runTest(min.data()); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1.data()); - ASSERT_TRUE( - devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP())); + ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sq, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sq, ::testing::ValuesIn(inputsf)); typedef FusedL2NNDetTest FusedL2NNDetTestF_Sqrt; -TEST_P(FusedL2NNDetTestF_Sqrt, Result) { +TEST_P(FusedL2NNDetTestF_Sqrt, Result) +{ runTest(min.data()); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1.data()); - ASSERT_TRUE( - devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP())); + ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sqrt, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sqrt, ::testing::ValuesIn(inputsf)); typedef FusedL2NNDetTest FusedL2NNDetTestD_Sq; -TEST_P(FusedL2NNDetTestD_Sq, Result) { +TEST_P(FusedL2NNDetTestD_Sq, Result) +{ runTest(min.data()); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1.data()); - ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m, - CompareExactKVP())); + ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sq, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sq, ::testing::ValuesIn(inputsd)); typedef FusedL2NNDetTest FusedL2NNDetTestD_Sqrt; -TEST_P(FusedL2NNDetTestD_Sqrt, Result) { +TEST_P(FusedL2NNDetTestD_Sqrt, Result) +{ runTest(min.data()); // assumed to be golden for (int i = 0; i < NumRepeats; ++i) { runTest(min1.data()); - ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m, - CompareExactKVP())); + ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP())); } } -INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sqrt, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sqrt, ::testing::ValuesIn(inputsd)); } // end namespace distance } // end namespace raft diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index ede790b38c..dc7de92eb8 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -25,7 +25,8 @@ namespace raft { -TEST(Raft, EigenSolvers) { +TEST(Raft, EigenSolvers) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -36,7 +37,7 @@ TEST(Raft, EigenSolvers) { index_type* ro{nullptr}; index_type* ci{nullptr}; value_type* vs{nullptr}; - index_type nnz = 0; + index_type nnz = 0; index_type nrows = 0; sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; @@ -48,7 +49,7 @@ TEST(Raft, EigenSolvers) { value_type tol{1.0e-10}; bool reorthog{true}; - //nullptr expected to trigger exceptions: + // nullptr expected to trigger exceptions: // value_type* eigvals{nullptr}; value_type* eigvecs{nullptr}; @@ -59,14 +60,13 @@ TEST(Raft, EigenSolvers) { lanczos_solver_t eig_solver{cfg}; - EXPECT_ANY_THROW( - eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs)); + EXPECT_ANY_THROW(eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs)); - EXPECT_ANY_THROW( - eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs)); + EXPECT_ANY_THROW(eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs)); } -TEST(Raft, SpectralSolvers) { +TEST(Raft, SpectralSolvers) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -80,7 +80,7 @@ TEST(Raft, SpectralSolvers) { value_type tol{1.0e-10}; bool reorthog{true}; - //nullptr expected to trigger exceptions: + // nullptr expected to trigger exceptions: // index_type* clusters{nullptr}; value_type* eigvals{nullptr}; @@ -94,19 +94,16 @@ TEST(Raft, SpectralSolvers) { index_type k{5}; - cluster_solver_config_t clust_cfg{k, maxiter, tol, - seed}; + cluster_solver_config_t clust_cfg{k, maxiter, tol, seed}; kmeans_solver_t cluster_solver{clust_cfg}; - sparse_matrix_t sm{h, nullptr, nullptr, - nullptr, 0, 0}; - EXPECT_ANY_THROW(spectral::partition(h, sm, eig_solver, cluster_solver, - clusters, eigvals, eigvecs)); + sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; + EXPECT_ANY_THROW( + spectral::partition(h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); value_type edgeCut{0}; value_type cost{0}; - EXPECT_ANY_THROW( - spectral::analyzePartition(h, sm, k, clusters, edgeCut, cost)); + EXPECT_ANY_THROW(spectral::analyzePartition(h, sm, k, clusters, edgeCut, cost)); } } // namespace raft diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp index 3e27789078..698a601e85 100644 --- a/cpp/test/handle.cpp +++ b/cpp/test/handle.cpp @@ -22,7 +22,8 @@ namespace raft { -TEST(Raft, HandleDefault) { +TEST(Raft, HandleDefault) +{ handle_t h; ASSERT_EQ(0, h.get_device()); ASSERT_EQ(nullptr, h.get_stream()); @@ -32,7 +33,8 @@ TEST(Raft, HandleDefault) { ASSERT_NE(nullptr, h.get_cusparse_handle()); } -TEST(Raft, Handle) { +TEST(Raft, Handle) +{ handle_t h(4); ASSERT_EQ(4, h.get_num_internal_streams()); cudaStream_t stream; @@ -43,13 +45,15 @@ TEST(Raft, Handle) { CUDA_CHECK(cudaStreamDestroy(stream)); } -TEST(Raft, GetInternalStreams) { +TEST(Raft, GetInternalStreams) +{ handle_t h(4); auto streams = h.get_internal_streams(); ASSERT_EQ(4U, streams.size()); } -TEST(Raft, GetHandleFromPool) { +TEST(Raft, GetHandleFromPool) +{ handle_t parent(4); handle_t child(parent, 2); @@ -62,13 +66,13 @@ TEST(Raft, GetHandleFromPool) { ASSERT_EQ(parent.get_device(), child.get_device()); } -TEST(Raft, GetHandleStreamViews) { +TEST(Raft, GetHandleStreamViews) +{ handle_t parent(4); handle_t child(parent, 2); ASSERT_EQ(parent.get_internal_stream_view(2), child.get_stream_view()); - ASSERT_EQ(parent.get_internal_stream_view(2).value(), - child.get_stream_view().value()); + ASSERT_EQ(parent.get_internal_stream_view(2).value(), child.get_stream_view().value()); EXPECT_FALSE(child.get_stream_view().is_default()); } } // namespace raft diff --git a/cpp/test/integer_utils.cpp b/cpp/test/integer_utils.cpp index 830d085a40..d883de59fe 100644 --- a/cpp/test/integer_utils.cpp +++ b/cpp/test/integer_utils.cpp @@ -20,7 +20,8 @@ namespace raft { -TEST(Raft, rounding_up) { +TEST(Raft, rounding_up) +{ ASSERT_EQ(raft::div_rounding_up_safe(5, 3), 2); ASSERT_EQ(raft::div_rounding_up_safe(0, 3), 0); ASSERT_EQ(raft::div_rounding_up_safe(7, 8), 1); @@ -29,7 +30,8 @@ TEST(Raft, rounding_up) { ASSERT_EQ(raft::div_rounding_up_unsafe(7, 8), 1); } -TEST(Raft, is_a_power_of_two) { +TEST(Raft, is_a_power_of_two) +{ ASSERT_EQ(raft::is_a_power_of_two(1 << 5), true); ASSERT_EQ(raft::is_a_power_of_two((1 << 5) + 1), false); } diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu index f79d8f10c8..d983ec1162 100644 --- a/cpp/test/label/label.cu +++ b/cpp/test/label/label.cu @@ -35,7 +35,8 @@ class labelTest : public ::testing::Test { }; typedef labelTest MakeMonotonicTest; -TEST_F(MakeMonotonicTest, Result) { +TEST_F(MakeMonotonicTest, Result) +{ cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -45,11 +46,9 @@ TEST_F(MakeMonotonicTest, Result) { rmm::device_uvector actual(m, stream); rmm::device_uvector expected(m, stream); - float *data_h = - new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0}; + float* data_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0}; - float *expected_h = - new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 6.0, 7.0}; + float* expected_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 6.0, 7.0}; raft::update_device(data.data(), data_h, m, stream); raft::update_device(expected.data(), expected_h, m, stream); @@ -58,14 +57,14 @@ TEST_F(MakeMonotonicTest, Result) { CUDA_CHECK(cudaStreamSynchronize(stream)); - ASSERT_TRUE(devArrMatch(actual.data(), expected.data(), m, - raft::Compare(), stream)); + ASSERT_TRUE(devArrMatch(actual.data(), expected.data(), m, raft::Compare(), stream)); delete data_h; delete expected_h; } -TEST(labelTest, Classlabels) { +TEST(labelTest, Classlabels) +{ cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -81,17 +80,16 @@ TEST(labelTest, Classlabels) { ASSERT_EQ(n_classes, 3); float y_unique_exp[] = {-1, 1, 2}; - EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d.data(), n_classes, - raft::Compare(), stream)); + EXPECT_TRUE( + devArrMatchHost(y_unique_exp, y_unique_d.data(), n_classes, raft::Compare(), stream)); rmm::device_uvector y_relabeled_d(n_rows, stream); - getOvrlabels(y_d.data(), n_rows, y_unique_d.data(), n_classes, - y_relabeled_d.data(), 2, stream); + getOvrlabels(y_d.data(), n_rows, y_unique_d.data(), n_classes, y_relabeled_d.data(), 2, stream); float y_relabeled_exp[] = {1, -1, -1, 1, -1, -1}; - EXPECT_TRUE(devArrMatchHost(y_relabeled_exp, y_relabeled_d.data(), n_rows, - raft::Compare(), stream)); + EXPECT_TRUE( + devArrMatchHost(y_relabeled_exp, y_relabeled_d.data(), n_rows, raft::Compare(), stream)); } }; // namespace label }; // namespace raft diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu index 76e0a4295e..dd67f0fd89 100644 --- a/cpp/test/label/merge_labels.cu +++ b/cpp/test/label/merge_labels.cu @@ -39,8 +39,7 @@ struct MergeLabelsInputs { }; template -class MergeLabelsTest - : public ::testing::TestWithParam> { +class MergeLabelsTest : public ::testing::TestWithParam> { protected: MergeLabelsTest() : params(::testing::TestWithParam>::GetParam()), @@ -50,25 +49,23 @@ class MergeLabelsTest expected(params.N, stream), R(params.N, stream), mask(params.N, stream), - m(stream) {} - - void Run() { - raft::update_device(labels_a.data(), params.labels_a.data(), params.N, - stream); - raft::update_device(labels_b.data(), params.labels_b.data(), params.N, - stream); - raft::update_device(expected.data(), params.expected.data(), params.N, - stream); - raft::update_device(mask.data(), - reinterpret_cast(params.mask.data()), params.N, - stream); - - merge_labels(labels_a.data(), labels_b.data(), mask.data(), R.data(), - m.data(), params.N, stream); + m(stream) + { + } + + void Run() + { + raft::update_device(labels_a.data(), params.labels_a.data(), params.N, stream); + raft::update_device(labels_b.data(), params.labels_b.data(), params.N, stream); + raft::update_device(expected.data(), params.expected.data(), params.N, stream); + raft::update_device(mask.data(), reinterpret_cast(params.mask.data()), params.N, stream); + + merge_labels( + labels_a.data(), labels_b.data(), mask.data(), R.data(), m.data(), params.N, stream); cudaStreamSynchronize(stream); - ASSERT_TRUE(raft::devArrMatch(expected.data(), labels_a.data(), - params.N, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch( + expected.data(), labels_a.data(), params.N, raft::Compare())); } protected: @@ -86,22 +83,14 @@ TEST_P(MergeLabelsTestI, Result) { Run(); } using MergeLabelsTestL = MergeLabelsTest; TEST_P(MergeLabelsTestL, Result) { Run(); } -constexpr int MAX32 = std::numeric_limits::max(); +constexpr int MAX32 = std::numeric_limits::max(); constexpr int64_t MAX64 = std::numeric_limits::max(); const std::vector> merge_inputs_32 = { {4, {1, 1, 3, MAX32}, {1, 3, 3, 1}, {1, 0, 1, 0}, {1, 1, 3, 1}}, {5, {1, 2, 2, 2, 1}, {4, 2, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, - {6, - {1, 2, 1, 4, 5, MAX32}, - {1, 2, MAX32, 4, 5, 4}, - {1, 1, 0, 1, 1, 0}, - {1, 2, 1, 4, 5, 4}}, - {6, - {1, 2, 2, 2, 2, 6}, - {1, 1, 1, 5, 5, 5}, - {1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1}}, + {6, {1, 2, 1, 4, 5, MAX32}, {1, 2, MAX32, 4, 5, 4}, {1, 1, 0, 1, 1, 0}, {1, 2, 1, 4, 5, 4}}, + {6, {1, 2, 2, 2, 2, 6}, {1, 1, 1, 5, 5, 5}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}}, {8, {1, 1, 3, 3, MAX32, 1, 3, MAX32}, {1, 2, 3, 2, MAX32, 2, 2, 2}, @@ -117,16 +106,8 @@ const std::vector> merge_inputs_32 = { const std::vector> merge_inputs_64 = { {4, {1, 1, 3, MAX64}, {1, 3, 3, 1}, {1, 0, 1, 0}, {1, 1, 3, 1}}, {5, {1, 2, 2, 2, 1}, {4, 2, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}, - {6, - {1, 2, 1, 4, 5, MAX64}, - {1, 2, MAX64, 4, 5, 4}, - {1, 1, 0, 1, 1, 0}, - {1, 2, 1, 4, 5, 4}}, - {6, - {1, 2, 2, 2, 2, 6}, - {1, 1, 1, 5, 5, 5}, - {1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1}}, + {6, {1, 2, 1, 4, 5, MAX64}, {1, 2, MAX64, 4, 5, 4}, {1, 1, 0, 1, 1, 0}, {1, 2, 1, 4, 5, 4}}, + {6, {1, 2, 2, 2, 2, 6}, {1, 1, 1, 5, 5, 5}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}}, {8, {1, 1, 3, 3, MAX64, 1, 3, MAX64}, {1, 2, 3, 2, MAX64, 2, 2, 2}, @@ -139,10 +120,8 @@ const std::vector> merge_inputs_64 = { {1, 1, 1, 1, 1, 7, 7, 7}}, }; -INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestI, - ::testing::ValuesIn(merge_inputs_32)); -INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestL, - ::testing::ValuesIn(merge_inputs_64)); +INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestI, ::testing::ValuesIn(merge_inputs_32)); +INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestL, ::testing::ValuesIn(merge_inputs_64)); } // namespace label } // namespace raft diff --git a/cpp/test/lap/lap.cu b/cpp/test/lap/lap.cu index 08429e18f2..183c0bd2f3 100644 --- a/cpp/test/lap/lap.cu +++ b/cpp/test/lap/lap.cu @@ -31,11 +31,11 @@ #include #include -#define PROBLEMSIZE 1000 // Number of rows/columns -#define BATCHSIZE 10 // Number of problems in the batch -#define COSTRANGE 1000 +#define PROBLEMSIZE 1000 // Number of rows/columns +#define BATCHSIZE 10 // Number of problems in the batch +#define COSTRANGE 1000 #define PROBLEMCOUNT 1 -#define REPETITIONS 1 +#define REPETITIONS 1 #define SEED 01010001 @@ -45,38 +45,41 @@ namespace raft { // Function for generating problem with uniformly distributed integer costs between [0, COSTRANGE]. template -void generateProblem(weight_t *cost_matrix, int SP, int N, int costrange) { +void generateProblem(weight_t* cost_matrix, int SP, int N, int costrange) +{ long N2 = SP * N * N; std::uniform_int_distribution distribution(0, costrange); for (long i = 0; i < N2; i++) { - int val = distribution(generator); + int val = distribution(generator); cost_matrix[i] = (weight_t)val; } } template -void hungarian_test(int problemsize, int costrange, int problemcount, - int repetitions, int batchsize, weight_t epsilon, - bool verbose = false) { +void hungarian_test(int problemsize, + int costrange, + int problemcount, + int repetitions, + int batchsize, + weight_t epsilon, + bool verbose = false) +{ raft::handle_t handle; - weight_t *h_cost = new weight_t[batchsize * problemsize * problemsize]; + weight_t* h_cost = new weight_t[batchsize * problemsize * problemsize]; for (int j = 0; j < problemcount; j++) { generateProblem(h_cost, batchsize, problemsize, costrange); - rmm::device_uvector elements_v( - batchsize * problemsize * problemsize, handle.get_stream()); - rmm::device_uvector row_assignment_v(batchsize * problemsize, - handle.get_stream()); - rmm::device_uvector col_assignment_v(batchsize * problemsize, - handle.get_stream()); + rmm::device_uvector elements_v(batchsize * problemsize * problemsize, + handle.get_stream()); + rmm::device_uvector row_assignment_v(batchsize * problemsize, handle.get_stream()); + rmm::device_uvector col_assignment_v(batchsize * problemsize, handle.get_stream()); - raft::update_device(elements_v.data(), h_cost, - batchsize * problemsize * problemsize, - handle.get_stream()); + raft::update_device( + elements_v.data(), h_cost, batchsize * problemsize * problemsize, handle.get_stream()); for (int i = 0; i < repetitions; i++) { float start = omp_get_wtime(); @@ -86,20 +89,18 @@ void hungarian_test(int problemsize, int costrange, int problemcount, handle, problemsize, batchsize, epsilon); // Solve LAP(s) for given cost matrix - lpx.solve(elements_v.data(), row_assignment_v.data(), - col_assignment_v.data()); + lpx.solve(elements_v.data(), row_assignment_v.data(), col_assignment_v.data()); float end = omp_get_wtime(); float total_time = (end - start); if (verbose) { - // Use getPrimalObjectiveValue and getDualObjectiveValue APIs to get primal and dual objectives. At optimality both values should match. + // Use getPrimalObjectiveValue and getDualObjectiveValue APIs to get primal and dual + // objectives. At optimality both values should match. for (int k = 0; k < batchsize; k++) { - std::cout << j << ":" << i << ":" << k << ":" - << lpx.getPrimalObjectiveValue(k) << ":" - << lpx.getDualObjectiveValue(k) << ":" << total_time - << std::endl; + std::cout << j << ":" << i << ":" << k << ":" << lpx.getPrimalObjectiveValue(k) << ":" + << lpx.getDualObjectiveValue(k) << ":" << total_time << std::endl; } } } @@ -108,34 +109,38 @@ void hungarian_test(int problemsize, int costrange, int problemcount, delete[] h_cost; } -TEST(Raft, HungarianIntFloat) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, float{1e-6}); +TEST(Raft, HungarianIntFloat) +{ + hungarian_test( + PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, float{1e-6}); } -TEST(Raft, HungarianIntDouble) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, double{1e-6}); +TEST(Raft, HungarianIntDouble) +{ + hungarian_test( + PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, double{1e-6}); } -TEST(Raft, HungarianIntLong) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, long{0}); +TEST(Raft, HungarianIntLong) +{ + hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, long{0}); } -TEST(Raft, HungarianLongFloat) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, float{1e-6}); +TEST(Raft, HungarianLongFloat) +{ + hungarian_test( + PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, float{1e-6}); } -TEST(Raft, HungarianLongDouble) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, - REPETITIONS, BATCHSIZE, double{1e-6}); +TEST(Raft, HungarianLongDouble) +{ + hungarian_test( + PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, double{1e-6}); } -TEST(Raft, HungarianLongLong) { - hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, - BATCHSIZE, long{0}); +TEST(Raft, HungarianLongLong) +{ + hungarian_test(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, long{0}); } } // namespace raft diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu index 48ad83dfd2..17b000044e 100644 --- a/cpp/test/linalg/add.cu +++ b/cpp/test/linalg/add.cu @@ -33,10 +33,13 @@ class AddTest : public ::testing::TestWithParam> { in1(params.len, stream), in2(params.len, stream), out_ref(params.len, stream), - out(params.len, stream) {} + out(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; @@ -47,9 +50,10 @@ class AddTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(stream)); } - void compare() { - ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), params.len, - raft::CompareApprox(params.tolerance))); + void compare() + { + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); } protected: diff --git a/cpp/test/linalg/add.cuh b/cpp/test/linalg/add.cuh index 137419758f..1d9352bfc1 100644 --- a/cpp/test/linalg/add.cuh +++ b/cpp/test/linalg/add.cuh @@ -23,18 +23,17 @@ namespace raft { namespace linalg { template -__global__ void naiveAddElemKernel(OutT *out, const InT *in1, const InT *in2, - int len) { +__global__ void naiveAddElemKernel(OutT* out, const InT* in1, const InT* in2, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = OutT(in1[idx] + in2[idx]); - } + if (idx < len) { out[idx] = OutT(in1[idx] + in2[idx]); } } template -void naiveAddElem(OutT *out, const InT *in1, const InT *in2, int len) { +void naiveAddElem(OutT* out, const InT* in1, const InT* in2, int len) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveAddElemKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -47,8 +46,8 @@ struct AddInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const AddInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const AddInputs& dims) +{ return os; } diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu index c8121bfbe4..c833faa0b2 100644 --- a/cpp/test/linalg/binary_op.cu +++ b/cpp/test/linalg/binary_op.cu @@ -29,28 +29,29 @@ namespace linalg { // for an extended __device__ lambda cannot have private or protected access // within its class template -void binaryOpLaunch(OutType *out, const InType *in1, const InType *in2, - IdxType len, cudaStream_t stream) { +void binaryOpLaunch( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ binaryOp( - out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, - stream); + out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, stream); } template -class BinaryOpTest - : public ::testing::TestWithParam> { +class BinaryOpTest : public ::testing::TestWithParam> { public: BinaryOpTest() - : params(::testing::TestWithParam< - BinaryOpInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), in1(params.len, stream), in2(params.len, stream), out_ref(params.len, stream), - out(params.len, stream) {} + out(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); IdxType len = params.len; r.uniform(in1.data(), len, InType(-1.0), InType(1.0), stream); @@ -71,67 +72,66 @@ class BinaryOpTest rmm::device_uvector out; }; -const std::vector> inputsf_i32 = { - {0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf_i32 = {{0.000001f, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestF_i32; -TEST_P(BinaryOpTestF_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestF_i32, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32, - ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32, ::testing::ValuesIn(inputsf_i32)); -const std::vector> inputsf_i64 = { - {0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf_i64 = {{0.000001f, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestF_i64; -TEST_P(BinaryOpTestF_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestF_i64, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i64, - ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i64, ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsf_i32_d = { {0.000001f, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestF_i32_D; -TEST_P(BinaryOpTestF_i32_D, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestF_i32_D, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32_D, - ::testing::ValuesIn(inputsf_i32_d)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32_D, ::testing::ValuesIn(inputsf_i32_d)); -const std::vector> inputsd_i32 = { - {0.00000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd_i32 = {{0.00000001, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestD_i32; -TEST_P(BinaryOpTestD_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestD_i32, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i32, - ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i32, ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.00000001, 1024 * 1024, 1234ULL}}; typedef BinaryOpTest BinaryOpTestD_i64; -TEST_P(BinaryOpTestD_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(BinaryOpTestD_i64, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i64, - ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i64, ::testing::ValuesIn(inputsd_i64)); template class BinaryOpAlignment : public ::testing::Test { protected: - BinaryOpAlignment() { + BinaryOpAlignment() + { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); } void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } public: - void Misaligned() { + void Misaligned() + { // Test to trigger cudaErrorMisalignedAddress if veclen is incorrectly // chosen. int n = 1024; @@ -141,8 +141,12 @@ class BinaryOpAlignment : public ::testing::Test { CUDA_CHECK(cudaMemsetAsync(x.data(), 0, n * sizeof(math_t), stream)); CUDA_CHECK(cudaMemsetAsync(y.data(), 0, n * sizeof(math_t), stream)); raft::linalg::binaryOp( - z.data() + 9, x.data() + 137, y.data() + 19, 256, - [] __device__(math_t x, math_t y) { return x + y; }, stream); + z.data() + 9, + x.data() + 137, + y.data() + 19, + 256, + [] __device__(math_t x, math_t y) { return x + y; }, + stream); } raft::handle_t handle; diff --git a/cpp/test/linalg/binary_op.cuh b/cpp/test/linalg/binary_op.cuh index fd8ed6dd1e..97cb3ecb24 100644 --- a/cpp/test/linalg/binary_op.cuh +++ b/cpp/test/linalg/binary_op.cuh @@ -24,18 +24,17 @@ namespace raft { namespace linalg { template -__global__ void naiveAddKernel(OutType *out, const InType *in1, - const InType *in2, IdxType len) { +__global__ void naiveAddKernel(OutType* out, const InType* in1, const InType* in2, IdxType len) +{ IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x); - if (idx < len) { - out[idx] = static_cast(in1[idx] + in2[idx]); - } + if (idx < len) { out[idx] = static_cast(in1[idx] + in2[idx]); } } template -void naiveAdd(OutType *out, const InType *in1, const InType *in2, IdxType len) { +void naiveAdd(OutType* out, const InType* in1, const InType* in2, IdxType len) +{ static const IdxType TPB = 64; - IdxType nblks = raft::ceildiv(len, TPB); + IdxType nblks = raft::ceildiv(len, TPB); naiveAddKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -48,8 +47,8 @@ struct BinaryOpInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const BinaryOpInputs &d) { +::std::ostream& operator<<(::std::ostream& os, const BinaryOpInputs& d) +{ return os; } diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu index 262a1ad26c..6c7bbd1232 100644 --- a/cpp/test/linalg/cholesky_r1.cu +++ b/cpp/test/linalg/cholesky_r1.cu @@ -36,7 +36,8 @@ class CholeskyR1Test : public ::testing::Test { L(n_rows * n_rows, handle.get_stream()), L_exp(n_rows * n_rows, handle.get_stream()), devInfo(handle.get_stream()), - workspace(0, handle.get_stream()) { + workspace(0, handle.get_stream()) + { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); raft::update_device(G.data(), G_host, n_rows * n_rows, stream); @@ -48,55 +49,58 @@ class CholeskyR1Test : public ::testing::Test { int n_bytes = 0; // Initializing in CUBLAS_FILL_MODE_LOWER, because that has larger workspace // requirements. - raft::linalg::choleskyRank1Update(handle, L.data(), n_rows, n_rows, nullptr, - &n_bytes, CUBLAS_FILL_MODE_LOWER, stream); + raft::linalg::choleskyRank1Update( + handle, L.data(), n_rows, n_rows, nullptr, &n_bytes, CUBLAS_FILL_MODE_LOWER, stream); Lwork = std::max(Lwork * sizeof(math_t), (size_t)n_bytes); workspace.resize(Lwork, stream); } void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } - void testR1Update() { + void testR1Update() + { int n = n_rows * n_rows; - std::vector fillmode{CUBLAS_FILL_MODE_LOWER, - CUBLAS_FILL_MODE_UPPER}; + std::vector fillmode{CUBLAS_FILL_MODE_LOWER, CUBLAS_FILL_MODE_UPPER}; for (auto uplo : fillmode) { raft::copy(L.data(), G.data(), n, stream); for (int rank = 1; rank <= n_rows; rank++) { std::stringstream ss; - ss << "Rank " << rank - << ((uplo == CUBLAS_FILL_MODE_LOWER) ? ", lower" : ", upper"); + ss << "Rank " << rank << ((uplo == CUBLAS_FILL_MODE_LOWER) ? ", lower" : ", upper"); SCOPED_TRACE(ss.str()); // Expected solution using Cholesky factorization from scratch raft::copy(L_exp.data(), G.data(), n, stream); - CUSOLVER_CHECK(raft::linalg::cusolverDnpotrf( - solver_handle, uplo, rank, L_exp.data(), n_rows, - (math_t*)workspace.data(), Lwork, devInfo.data(), stream)); + CUSOLVER_CHECK(raft::linalg::cusolverDnpotrf(solver_handle, + uplo, + rank, + L_exp.data(), + n_rows, + (math_t*)workspace.data(), + Lwork, + devInfo.data(), + stream)); // Incremental Cholesky factorization using rank one updates. - raft::linalg::choleskyRank1Update(handle, L.data(), rank, n_rows, - workspace.data(), &Lwork, uplo, - stream); + raft::linalg::choleskyRank1Update( + handle, L.data(), rank, n_rows, workspace.data(), &Lwork, uplo, stream); - ASSERT_TRUE(raft::devArrMatch(L_exp.data(), L.data(), n_rows * rank, - raft::CompareApprox(3e-3))); + ASSERT_TRUE(raft::devArrMatch( + L_exp.data(), L.data(), n_rows * rank, raft::CompareApprox(3e-3))); } } } - void testR1Error() { + void testR1Error() + { raft::update_device(G.data(), G2_host, 4, stream); - std::vector fillmode{CUBLAS_FILL_MODE_LOWER, - CUBLAS_FILL_MODE_UPPER}; + std::vector fillmode{CUBLAS_FILL_MODE_LOWER, CUBLAS_FILL_MODE_UPPER}; for (auto uplo : fillmode) { raft::copy(L.data(), G.data(), 4, stream); ASSERT_NO_THROW(raft::linalg::choleskyRank1Update( handle, L.data(), 1, 2, workspace.data(), &Lwork, uplo, stream)); - ASSERT_THROW( - raft::linalg::choleskyRank1Update( - handle, L.data(), 2, 2, workspace.data(), &Lwork, uplo, stream), - raft::exception); + ASSERT_THROW(raft::linalg::choleskyRank1Update( + handle, L.data(), 2, 2, workspace.data(), &Lwork, uplo, stream), + raft::exception); math_t eps = std::numeric_limits::epsilon(); ASSERT_NO_THROW(raft::linalg::choleskyRank1Update( diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu index fdfc3052b7..9bb84e1eb7 100644 --- a/cpp/test/linalg/coalesced_reduction.cu +++ b/cpp/test/linalg/coalesced_reduction.cu @@ -33,8 +33,8 @@ struct coalescedReductionInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const coalescedReductionInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const coalescedReductionInputs& dims) +{ return os; } @@ -42,25 +42,28 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void coalescedReductionLaunch(T *dots, const T *data, int cols, int rows, - cudaStream_t stream, bool inplace = false) { - coalescedReduction(dots, data, cols, rows, (T)0, stream, inplace, - [] __device__(T in, int i) { return in * in; }); +void coalescedReductionLaunch( + T* dots, const T* data, int cols, int rows, cudaStream_t stream, bool inplace = false) +{ + coalescedReduction( + dots, data, cols, rows, (T)0, stream, inplace, [] __device__(T in, int i) { return in * in; }); } template -class coalescedReductionTest - : public ::testing::TestWithParam> { +class coalescedReductionTest : public ::testing::TestWithParam> { public: coalescedReductionTest() : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), data(params.rows * params.cols, stream), dots_exp(params.rows * params.cols, stream), - dots_act(params.rows * params.cols, stream) {} + dots_act(params.rows * params.cols, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols; int len = rows * cols; @@ -70,8 +73,7 @@ class coalescedReductionTest // Perform reduction with default inplace = false first coalescedReductionLaunch(dots_act.data(), data.data(), cols, rows, stream); // Add to result with inplace = true next - coalescedReductionLaunch(dots_act.data(), data.data(), cols, rows, stream, - true); + coalescedReductionLaunch(dots_act.data(), data.data(), cols, rows, stream, true); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -86,34 +88,36 @@ class coalescedReductionTest rmm::device_uvector dots_act; }; -const std::vector> inputsf = { - {0.000002f, 1024, 32, 1234ULL}, - {0.000002f, 1024, 64, 1234ULL}, - {0.000002f, 1024, 128, 1234ULL}, - {0.000002f, 1024, 256, 1234ULL}}; +const std::vector> inputsf = {{0.000002f, 1024, 32, 1234ULL}, + {0.000002f, 1024, 64, 1234ULL}, + {0.000002f, 1024, 128, 1234ULL}, + {0.000002f, 1024, 256, 1234ULL}}; -const std::vector> inputsd = { - {0.000000001, 1024, 32, 1234ULL}, - {0.000000001, 1024, 64, 1234ULL}, - {0.000000001, 1024, 128, 1234ULL}, - {0.000000001, 1024, 256, 1234ULL}}; +const std::vector> inputsd = {{0.000000001, 1024, 32, 1234ULL}, + {0.000000001, 1024, 64, 1234ULL}, + {0.000000001, 1024, 128, 1234ULL}, + {0.000000001, 1024, 256, 1234ULL}}; typedef coalescedReductionTest coalescedReductionTestF; -TEST_P(coalescedReductionTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp.data(), dots_act.data(), params.rows, - raft::CompareApprox(params.tolerance))); +TEST_P(coalescedReductionTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp.data(), dots_act.data(), params.rows, raft::CompareApprox(params.tolerance))); } typedef coalescedReductionTest coalescedReductionTestD; -TEST_P(coalescedReductionTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp.data(), dots_act.data(), params.rows, - raft::CompareApprox(params.tolerance))); +TEST_P(coalescedReductionTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp.data(), dots_act.data(), params.rows, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestF, +INSTANTIATE_TEST_CASE_P(coalescedReductionTests, + coalescedReductionTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestD, +INSTANTIATE_TEST_CASE_P(coalescedReductionTests, + coalescedReductionTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu index d90955147c..130a22abf0 100644 --- a/cpp/test/linalg/divide.cu +++ b/cpp/test/linalg/divide.cu @@ -25,37 +25,36 @@ namespace raft { namespace linalg { template -__global__ void naiveDivideKernel(Type *out, const Type *in, Type scalar, - int len) { +__global__ void naiveDivideKernel(Type* out, const Type* in, Type scalar, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in[idx] / scalar; - } + if (idx < len) { out[idx] = in[idx] / scalar; } } template -void naiveDivide(Type *out, const Type *in, Type scalar, int len, - cudaStream_t stream) { +void naiveDivide(Type* out, const Type* in, Type scalar, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveDivideKernel<<>>(out, in, scalar, len); CUDA_CHECK(cudaPeekAtLastError()); } template -class DivideTest - : public ::testing::TestWithParam> { +class DivideTest : public ::testing::TestWithParam> { public: DivideTest() - : params( - ::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), in(params.len, stream), out_ref(params.len, stream), - out(params.len, stream) {} + out(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int len = params.len; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -75,25 +74,23 @@ class DivideTest rmm::device_uvector out; }; -const std::vector> inputsf = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; typedef DivideTest DivideTestF; -TEST_P(DivideTestF, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - raft::CompareApprox(params.tolerance))); +TEST_P(DivideTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestF, ::testing::ValuesIn(inputsf)); typedef DivideTest DivideTestD; -const std::vector> inputsd = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; -TEST_P(DivideTestD, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - raft::CompareApprox(params.tolerance))); +const std::vector> inputsd = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +TEST_P(DivideTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu index 2ac9118506..3df3abd2af 100644 --- a/cpp/test/linalg/eig.cu +++ b/cpp/test/linalg/eig.cu @@ -35,7 +35,8 @@ struct EigInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const EigInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const EigInputs& dims) +{ return os; } @@ -56,34 +57,60 @@ class EigTest : public ::testing::TestWithParam> { eig_vectors_large(params.n * params.n, stream), eig_vectors_jacobi_large(params.n * params.n, stream), eig_vals_large(params.n, stream), - eig_vals_jacobi_large(params.n, stream) {} + eig_vals_jacobi_large(params.n, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int len = params.len; - T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, - 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; + T cov_matrix_h[] = { + 1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; ASSERT(len == 16, "This test only works with 4x4 matrices!"); raft::update_device(cov_matrix.data(), cov_matrix_h, len, stream); - T eig_vectors_ref_h[] = {0.2790, -0.6498, 0.6498, -0.2789, -0.5123, 0.4874, - 0.4874, -0.5123, 0.6498, 0.2789, -0.2789, -0.6498, - 0.4874, 0.5123, 0.5123, 0.4874}; - T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266}; + T eig_vectors_ref_h[] = {0.2790, + -0.6498, + 0.6498, + -0.2789, + -0.5123, + 0.4874, + 0.4874, + -0.5123, + 0.6498, + 0.2789, + -0.2789, + -0.6498, + 0.4874, + 0.5123, + 0.5123, + 0.4874}; + T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266}; raft::update_device(eig_vectors_ref.data(), eig_vectors_ref_h, len, stream); - raft::update_device(eig_vals_ref.data(), eig_vals_ref_h, params.n_col, - stream); + raft::update_device(eig_vals_ref.data(), eig_vals_ref_h, params.n_col, stream); - eigDC(handle, cov_matrix.data(), params.n_row, params.n_col, - eig_vectors.data(), eig_vals.data(), stream); + eigDC(handle, + cov_matrix.data(), + params.n_row, + params.n_col, + eig_vectors.data(), + eig_vals.data(), + stream); - T tol = 1.e-7; + T tol = 1.e-7; int sweeps = 15; - eigJacobi(handle, cov_matrix.data(), params.n_row, params.n_col, - eig_vectors_jacobi.data(), eig_vals_jacobi.data(), stream, tol, + eigJacobi(handle, + cov_matrix.data(), + params.n_row, + params.n_col, + eig_vectors_jacobi.data(), + eig_vals_jacobi.data(), + stream, + tol, sweeps); // test code for comparing two methods @@ -91,11 +118,22 @@ class EigTest : public ::testing::TestWithParam> { r.uniform(cov_matrix_large.data(), len, T(-1.0), T(1.0), stream); - eigDC(handle, cov_matrix_large.data(), params.n, params.n, - eig_vectors_large.data(), eig_vals_large.data(), stream); - eigJacobi(handle, cov_matrix_large.data(), params.n, params.n, - eig_vectors_jacobi_large.data(), eig_vals_jacobi_large.data(), - stream, tol, sweeps); + eigDC(handle, + cov_matrix_large.data(), + params.n, + params.n, + eig_vectors_large.data(), + eig_vals_large.data(), + stream); + eigJacobi(handle, + cov_matrix_large.data(), + params.n, + params.n, + eig_vectors_jacobi_large.data(), + eig_vals_jacobi_large.data(), + stream, + tol, + sweeps); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -105,87 +143,105 @@ class EigTest : public ::testing::TestWithParam> { EigInputs params; - rmm::device_uvector cov_matrix, eig_vectors, eig_vectors_jacobi, - eig_vectors_ref, eig_vals, eig_vals_jacobi, eig_vals_ref; + rmm::device_uvector cov_matrix, eig_vectors, eig_vectors_jacobi, eig_vectors_ref, eig_vals, + eig_vals_jacobi, eig_vals_ref; - rmm::device_uvector cov_matrix_large, eig_vectors_large, - eig_vectors_jacobi_large, eig_vals_large, eig_vals_jacobi_large; + rmm::device_uvector cov_matrix_large, eig_vectors_large, eig_vectors_jacobi_large, + eig_vals_large, eig_vals_jacobi_large; }; -const std::vector> inputsf2 = { - {0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; -const std::vector> inputsd2 = { - {0.001, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}}; typedef EigTest EigTestValF; -TEST_P(EigTestValF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref.data(), eig_vals.data(), params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(), + eig_vals.data(), + params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestValD; -TEST_P(EigTestValD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref.data(), eig_vals.data(), params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(), + eig_vals.data(), + params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecF; -TEST_P(EigTestVecF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref.data(), eig_vectors.data(), params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(), + eig_vectors.data(), + params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecD; -TEST_P(EigTestVecD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref.data(), eig_vectors.data(), params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(), + eig_vectors.data(), + params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestValJacobiF; -TEST_P(EigTestValJacobiF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref.data(), eig_vals_jacobi.data(), params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValJacobiF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(), + eig_vals_jacobi.data(), + params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestValJacobiD; -TEST_P(EigTestValJacobiD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref.data(), eig_vals_jacobi.data(), params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestValJacobiD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(), + eig_vals_jacobi.data(), + params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecJacobiF; -TEST_P(EigTestVecJacobiF, Result) { - ASSERT_TRUE(raft::devArrMatch( - eig_vectors_ref.data(), eig_vectors_jacobi.data(), params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecJacobiF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(), + eig_vectors_jacobi.data(), + params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecJacobiD; -TEST_P(EigTestVecJacobiD, Result) { - ASSERT_TRUE(raft::devArrMatch( - eig_vectors_ref.data(), eig_vectors_jacobi.data(), params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecJacobiD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(), + eig_vectors_jacobi.data(), + params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecCompareF; -TEST_P(EigTestVecCompareF, Result) { - ASSERT_TRUE(raft::devArrMatch( - eig_vectors_large.data(), eig_vectors_jacobi_large.data(), - (params.n * params.n), raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecCompareF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_large.data(), + eig_vectors_jacobi_large.data(), + (params.n * params.n), + raft::CompareApproxAbs(params.tolerance))); } typedef EigTest EigTestVecCompareD; -TEST_P(EigTestVecCompareD, Result) { - ASSERT_TRUE(raft::devArrMatch( - eig_vectors_large.data(), eig_vectors_jacobi_large.data(), - (params.n * params.n), raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigTestVecCompareD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_large.data(), + eig_vectors_jacobi_large.data(), + (params.n * params.n), + raft::CompareApproxAbs(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValF, ::testing::ValuesIn(inputsf2)); @@ -196,17 +252,13 @@ INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecF, ::testing::ValuesIn(inputsf2)); INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiD, ::testing::ValuesIn(inputsd2)); } // namespace linalg } // namespace raft diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu index 9eb1c10313..b1e88c91dd 100644 --- a/cpp/test/linalg/eig_sel.cu +++ b/cpp/test/linalg/eig_sel.cu @@ -37,7 +37,8 @@ struct EigSelInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const EigSelInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const EigSelInputs& dims) +{ return os; } @@ -51,27 +52,46 @@ class EigSelTest : public ::testing::TestWithParam> { eig_vectors(12, stream), eig_vectors_ref(12, stream), eig_vals(params.n_col, stream), - eig_vals_ref(params.n_col, stream) {} + eig_vals_ref(params.n_col, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { int len = params.len; - T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, - 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; + T cov_matrix_h[] = { + 1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; ASSERT(len == 16, "This test only works with 4x4 matrices!"); raft::update_device(cov_matrix.data(), cov_matrix_h, len, stream); - T eig_vectors_ref_h[] = {-0.5123, 0.4874, 0.4874, -0.5123, 0.6498, 0.2789, - -0.2789, -0.6498, 0.4874, 0.5123, 0.5123, 0.4874}; - T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266}; + T eig_vectors_ref_h[] = {-0.5123, + 0.4874, + 0.4874, + -0.5123, + 0.6498, + 0.2789, + -0.2789, + -0.6498, + 0.4874, + 0.5123, + 0.5123, + 0.4874}; + T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266}; raft::update_device(eig_vectors_ref.data(), eig_vectors_ref_h, 12, stream); raft::update_device(eig_vals_ref.data(), eig_vals_ref_h, 4, stream); - eigSelDC(handle, cov_matrix.data(), params.n_row, params.n_col, 3, - eig_vectors.data(), eig_vals.data(), - EigVecMemUsage::OVERWRITE_INPUT, stream); + eigSelDC(handle, + cov_matrix.data(), + params.n_row, + params.n_col, + 3, + eig_vectors.data(), + eig_vals.data(), + EigVecMemUsage::OVERWRITE_INPUT, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -87,51 +107,53 @@ class EigSelTest : public ::testing::TestWithParam> { rmm::device_uvector eig_vals_ref; }; -const std::vector> inputsf2 = { - {0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; -const std::vector> inputsd2 = { - {0.001, 4 * 4, 4, 4, 1234ULL, 256}}; +const std::vector> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}}; typedef EigSelTest EigSelTestValF; -TEST_P(EigSelTestValF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref.data(), eig_vals.data(), params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestValF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(), + eig_vals.data(), + params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef EigSelTest EigSelTestValD; -TEST_P(EigSelTestValD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vals_ref.data(), eig_vals.data(), params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestValD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(), + eig_vals.data(), + params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef EigSelTest EigSelTestVecF; -TEST_P(EigSelTestVecF, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref.data(), eig_vectors.data(), 12, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestVecF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(), + eig_vectors.data(), + 12, + raft::CompareApproxAbs(params.tolerance))); } typedef EigSelTest EigSelTestVecD; -TEST_P(EigSelTestVecD, Result) { - ASSERT_TRUE( - raft::devArrMatch(eig_vectors_ref.data(), eig_vectors.data(), 12, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(EigSelTestVecD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(), + eig_vectors.data(), + 12, + raft::CompareApproxAbs(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecD, ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu index c3b26f5423..5ecca16be6 100644 --- a/cpp/test/linalg/eltwise.cu +++ b/cpp/test/linalg/eltwise.cu @@ -26,19 +26,17 @@ namespace linalg { //// Testing unary ops template -__global__ void naiveScaleKernel(Type *out, const Type *in, Type scalar, - int len) { +__global__ void naiveScaleKernel(Type* out, const Type* in, Type scalar, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = scalar * in[idx]; - } + if (idx < len) { out[idx] = scalar * in[idx]; } } template -void naiveScale(Type *out, const Type *in, Type scalar, int len, - cudaStream_t stream) { +void naiveScale(Type* out, const Type* in, Type scalar, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveScaleKernel<<>>(out, in, scalar, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -52,26 +50,28 @@ struct ScalarMultiplyInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const ScalarMultiplyInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const ScalarMultiplyInputs& dims) +{ return os; } template -class ScalarMultiplyTest - : public ::testing::TestWithParam> { +class ScalarMultiplyTest : public ::testing::TestWithParam> { public: ScalarMultiplyTest() : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), in(len, stream), out_ref(len, stream), - out(len, stream) {} + out(len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); - int len = params.len; + int len = params.len; T scalar = params.scalar; r.uniform(in, len, T(-1.0), T(1.0), stream); naiveScale(out_ref, in, scalar, len, stream); @@ -87,46 +87,43 @@ class ScalarMultiplyTest rmm::device_uvector in, out_ref, out; }; -const std::vector> inputsf1 = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf1 = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; const std::vector> inputsd1 = { {0.00000001, 1024 * 1024, 2.0, 1234ULL}}; typedef ScalarMultiplyTest ScalarMultiplyTestF; -TEST_P(ScalarMultiplyTestF, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(ScalarMultiplyTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } typedef ScalarMultiplyTest ScalarMultiplyTestD; -TEST_P(ScalarMultiplyTestD, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(ScalarMultiplyTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestF, - ::testing::ValuesIn(inputsf1)); +INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestF, ::testing::ValuesIn(inputsf1)); -INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestD, - ::testing::ValuesIn(inputsd1)); +INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestD, ::testing::ValuesIn(inputsd1)); //// Testing binary ops template -__global__ void naiveAddKernel(Type *out, const Type *in1, const Type *in2, - int len) { +__global__ void naiveAddKernel(Type* out, const Type* in1, const Type* in2, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in1[idx] + in2[idx]; - } + if (idx < len) { out[idx] = in1[idx] + in2[idx]; } } template -void naiveAdd(Type *out, const Type *in1, const Type *in2, int len, - cudaStream_t stream) { +void naiveAdd(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveAddKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -139,8 +136,8 @@ struct EltwiseAddInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const EltwiseAddInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const EltwiseAddInputs& dims) +{ return os; } @@ -153,10 +150,13 @@ class EltwiseAddTest : public ::testing::TestWithParam> { in1(params.len, stream), in2(params.len, stream), out_ref(params.len, stream), - out(params.len, stream) {} + out(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; @@ -175,29 +175,27 @@ class EltwiseAddTest : public ::testing::TestWithParam> { rmm::device_uvector in1, in2, out_ref, out; }; -const std::vector> inputsf2 = { - {0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}}; -const std::vector> inputsd2 = { - {0.00000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}}; typedef EltwiseAddTest EltwiseAddTestF; -TEST_P(EltwiseAddTestF, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(EltwiseAddTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } typedef EltwiseAddTest EltwiseAddTestD; -TEST_P(EltwiseAddTestD, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(EltwiseAddTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestD, ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/gemm_layout.cu b/cpp/test/linalg/gemm_layout.cu index 699d40d55e..6231715c8a 100644 --- a/cpp/test/linalg/gemm_layout.cu +++ b/cpp/test/linalg/gemm_layout.cu @@ -36,9 +36,9 @@ struct GemmLayoutInputs { // Reference GEMM implementation. template -__global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K, - bool isZColMajor, bool isXColMajor, - bool isYColMajor) { +__global__ void naiveGemm( + T* Z, T* X, T* Y, int M, int N, int K, bool isZColMajor, bool isXColMajor, bool isYColMajor) +{ int tidx = blockIdx.x * blockDim.x + threadIdx.x; int tidy = blockIdx.y * blockDim.y + threadIdx.y; @@ -51,7 +51,7 @@ __global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K, temp += X[xIndex] * Y[yIndex]; } int zIndex = isZColMajor ? m + n * M : m * N + n; - Z[zIndex] = temp; + Z[zIndex] = temp; } } } @@ -59,7 +59,8 @@ __global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K, template class GemmLayoutTest : public ::testing::TestWithParam> { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::handle_t handle; @@ -72,8 +73,8 @@ class GemmLayoutTest : public ::testing::TestWithParam> { // Dimensions of Y : K x N // Dimensions of Z : M x N - T *X = NULL; // Argument X - T *Y = NULL; // Argument Y + T* X = NULL; // Argument X + T* Y = NULL; // Argument Y size_t xElems = params.M * params.K; size_t yElems = params.K * params.N; @@ -87,27 +88,35 @@ class GemmLayoutTest : public ::testing::TestWithParam> { r.uniform(X, xElems, T(-10.0), T(10.0), stream); r.uniform(Y, yElems, T(-10.0), T(10.0), stream); - dim3 blocks(raft::ceildiv(params.M, 128), - raft::ceildiv(params.N, 4), 1); + dim3 blocks(raft::ceildiv(params.M, 128), raft::ceildiv(params.N, 4), 1); dim3 threads(128, 4, 1); - naiveGemm<<>>(refZ, X, Y, params.M, params.N, params.K, - params.zLayout, params.xLayout, - params.yLayout); - - gemm(handle, Z, X, Y, params.M, params.N, params.K, params.zLayout, - params.xLayout, params.yLayout, stream); + naiveGemm<<>>( + refZ, X, Y, params.M, params.N, params.K, params.zLayout, params.xLayout, params.yLayout); + + gemm(handle, + Z, + X, + Y, + params.M, + params.N, + params.K, + params.zLayout, + params.xLayout, + params.yLayout, + stream); } - void TearDown() override { + void TearDown() override + { CUDA_CHECK(cudaFree(refZ)); CUDA_CHECK(cudaFree(Z)); } protected: GemmLayoutInputs params; - T *refZ = NULL; // Reference result for comparison - T *Z = NULL; // Computed result + T* refZ = NULL; // Reference result for comparison + T* Z = NULL; // Computed result }; const std::vector> inputsf = { @@ -131,22 +140,20 @@ const std::vector> inputsd = { {50, 80, 60, false, false, false, 893038ULL}}; typedef GemmLayoutTest GemmLayoutTestF; -TEST_P(GemmLayoutTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, - raft::CompareApprox(1e-4))); +TEST_P(GemmLayoutTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox(1e-4))); } typedef GemmLayoutTest GemmLayoutTestD; -TEST_P(GemmLayoutTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, - raft::CompareApprox(1e-6))); +TEST_P(GemmLayoutTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox(1e-6))); } -INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/gemv.cu b/cpp/test/linalg/gemv.cu index 92e59ae49b..4d5472f38c 100644 --- a/cpp/test/linalg/gemv.cu +++ b/cpp/test/linalg/gemv.cu @@ -34,10 +34,16 @@ struct GemvInputs { // Reference GEMV implementation. template -__global__ void naiveGemv(T *y, const T *A, const T *x, const int n_rows, - const int n_cols, const int lda, const bool trans_a) { +__global__ void naiveGemv(T* y, + const T* A, + const T* x, + const int n_rows, + const int n_cols, + const int lda, + const bool trans_a) +{ int istart = blockIdx.x * blockDim.x + threadIdx.x; - int istep = blockDim.x * gridDim.x; + int istep = blockDim.x * gridDim.x; if (!trans_a) { for (int i = istart; i < n_rows; i += istep) { @@ -69,12 +75,14 @@ class GemvTest : public ::testing::TestWithParam> { GemvTest() : testing::TestWithParam>(), refy(0, rmm::cuda_stream_default), - y(0, rmm::cuda_stream_default) { + y(0, rmm::cuda_stream_default) + { rmm::cuda_stream_default.synchronize(); } protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::handle_t handle; @@ -98,39 +106,55 @@ class GemvTest : public ::testing::TestWithParam> { dim3 blocks(raft::ceildiv(yElems, 256), 1, 1); dim3 threads(256, 1, 1); - naiveGemv<<>>(refy.data(), A.data(), x.data(), - params.n_rows, params.n_cols, params.lda, - params.trans_a); - - gemv(handle, A.data(), params.n_rows, params.n_cols, params.lda, x.data(), - y.data(), params.trans_a, stream); + naiveGemv<<>>( + refy.data(), A.data(), x.data(), params.n_rows, params.n_cols, params.lda, params.trans_a); + + gemv(handle, + A.data(), + params.n_rows, + params.n_cols, + params.lda, + x.data(), + y.data(), + params.trans_a, + stream); } void TearDown() override {} }; -const std::vector> inputsf = { - {80, 70, 80, true, 76433ULL}, {80, 100, 80, true, 426646ULL}, - {20, 100, 20, true, 37703ULL}, {100, 60, 200, true, 538004ULL}, - {50, 10, 60, false, 73012ULL}, {90, 90, 90, false, 538147ULL}, - {30, 100, 30, false, 412352ULL}, {40, 80, 100, false, 297941ULL}}; - -const std::vector> inputsd = { - {10, 70, 10, true, 535648ULL}, {30, 30, 30, true, 956681ULL}, - {70, 80, 70, true, 875083ULL}, {80, 90, 200, true, 50744ULL}, - {90, 90, 90, false, 506321ULL}, {40, 100, 70, false, 638418ULL}, - {80, 50, 80, false, 701529ULL}, {50, 80, 60, false, 893038ULL}}; +const std::vector> inputsf = {{80, 70, 80, true, 76433ULL}, + {80, 100, 80, true, 426646ULL}, + {20, 100, 20, true, 37703ULL}, + {100, 60, 200, true, 538004ULL}, + {50, 10, 60, false, 73012ULL}, + {90, 90, 90, false, 538147ULL}, + {30, 100, 30, false, 412352ULL}, + {40, 80, 100, false, 297941ULL}}; + +const std::vector> inputsd = {{10, 70, 10, true, 535648ULL}, + {30, 30, 30, true, 956681ULL}, + {70, 80, 70, true, 875083ULL}, + {80, 90, 200, true, 50744ULL}, + {90, 90, 90, false, 506321ULL}, + {40, 100, 70, false, 638418ULL}, + {80, 50, 80, false, 701529ULL}, + {50, 80, 60, false, 893038ULL}}; typedef GemvTest GemvTestF; -TEST_P(GemvTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(refy.data(), y.data(), +TEST_P(GemvTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(refy.data(), + y.data(), params.trans_a ? params.n_cols : params.n_rows, raft::CompareApprox(1e-4))); } typedef GemvTest GemvTestD; -TEST_P(GemvTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(refy.data(), y.data(), +TEST_P(GemvTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(refy.data(), + y.data(), params.trans_a ? params.n_cols : params.n_rows, raft::CompareApprox(1e-6))); } diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index f04c225aa9..787d9ba415 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -25,13 +25,22 @@ namespace raft { namespace linalg { template -void mapLaunch(OutType *out, const InType *in1, const InType *in2, - const InType *in3, InType scalar, IdxType len, - cudaStream_t stream) { +void mapLaunch(OutType* out, + const InType* in1, + const InType* in2, + const InType* in3, + InType scalar, + IdxType len, + cudaStream_t stream) +{ map( - out, len, + out, + len, [=] __device__(InType a, InType b, InType c) { return a + b + c + scalar; }, - stream, in1, in2, in3); + stream, + in1, + in2, + in3); } template @@ -43,9 +52,14 @@ struct MapInputs { }; template -void create_ref(OutType *out_ref, const InType *in1, const InType *in2, - const InType *in3, InType scalar, IdxType len, - cudaStream_t stream) { +void create_ref(OutType* out_ref, + const InType* in1, + const InType* in2, + const InType* in3, + InType scalar, + IdxType len, + cudaStream_t stream) +{ rmm::device_uvector tmp(len, stream); eltwiseAdd(tmp.data(), in1, in2, len, stream); eltwiseAdd(out_ref, tmp.data(), in3, len, stream); @@ -54,21 +68,22 @@ void create_ref(OutType *out_ref, const InType *in1, const InType *in2, } template -class MapTest - : public ::testing::TestWithParam> { +class MapTest : public ::testing::TestWithParam> { public: MapTest() - : params(::testing::TestWithParam< - MapInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), in1(params.len, stream), in2(params.len, stream), in3(params.len, stream), out_ref(params.len, stream), - out(params.len, stream) {} + out(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); IdxType len = params.len; @@ -76,10 +91,8 @@ class MapTest r.uniform(in2.data(), len, InType(-1.0), InType(1.0), stream); r.uniform(in3.data(), len, InType(-1.0), InType(1.0), stream); - create_ref(out_ref.data(), in1.data(), in2.data(), in3.data(), - params.scalar, len, stream); - mapLaunch(out.data(), in1.data(), in2.data(), in3.data(), params.scalar, - len, stream); + create_ref(out_ref.data(), in1.data(), in2.data(), in3.data(), params.scalar, len, stream); + mapLaunch(out.data(), in1.data(), in2.data(), in3.data(), params.scalar, len, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -92,55 +105,52 @@ class MapTest rmm::device_uvector out_ref, out; }; -const std::vector> inputsf_i32 = { - {0.000001f, 1024 * 1024, 1234ULL, 3.2}}; +const std::vector> inputsf_i32 = {{0.000001f, 1024 * 1024, 1234ULL, 3.2}}; typedef MapTest MapTestF_i32; -TEST_P(MapTestF_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestF_i32, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32, - ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32, ::testing::ValuesIn(inputsf_i32)); -const std::vector> inputsf_i64 = { - {0.000001f, 1024 * 1024, 1234ULL, 9.4}}; +const std::vector> inputsf_i64 = {{0.000001f, 1024 * 1024, 1234ULL, 9.4}}; typedef MapTest MapTestF_i64; -TEST_P(MapTestF_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestF_i64, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i64, - ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i64, ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsf_i32_d = { {0.000001f, 1024 * 1024, 1234ULL, 5.9}}; typedef MapTest MapTestF_i32_D; -TEST_P(MapTestF_i32_D, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestF_i32_D, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32_D, - ::testing::ValuesIn(inputsf_i32_d)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32_D, ::testing::ValuesIn(inputsf_i32_d)); -const std::vector> inputsd_i32 = { - {0.00000001, 1024 * 1024, 1234ULL, 7.5}}; +const std::vector> inputsd_i32 = {{0.00000001, 1024 * 1024, 1234ULL, 7.5}}; typedef MapTest MapTestD_i32; -TEST_P(MapTestD_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestD_i32, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i32, - ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i32, ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.00000001, 1024 * 1024, 1234ULL, 5.2}}; typedef MapTest MapTestD_i64; -TEST_P(MapTestD_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MapTestD_i64, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i64, - ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i64, ::testing::ValuesIn(inputsd_i64)); } // namespace linalg } // namespace raft diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu index 9d59e49e60..1594cc3544 100644 --- a/cpp/test/linalg/map_then_reduce.cu +++ b/cpp/test/linalg/map_then_reduce.cu @@ -27,21 +27,18 @@ namespace raft { namespace linalg { template -__global__ void naiveMapReduceKernel(OutType *out, const InType *in, size_t len, - MapOp map) { +__global__ void naiveMapReduceKernel(OutType* out, const InType* in, size_t len, MapOp map) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - raft::myAtomicAdd(out, (OutType)map(in[idx])); - } + if (idx < len) { raft::myAtomicAdd(out, (OutType)map(in[idx])); } } template -void naiveMapReduce(OutType *out, const InType *in, size_t len, MapOp map, - cudaStream_t stream) { +void naiveMapReduce(OutType* out, const InType* in, size_t len, MapOp map, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, (size_t)TPB); - naiveMapReduceKernel - <<>>(out, in, len, map); + int nblks = raft::ceildiv(len, (size_t)TPB); + naiveMapReduceKernel<<>>(out, in, len, map); CUDA_CHECK(cudaPeekAtLastError()); } @@ -53,7 +50,8 @@ struct MapReduceInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const MapReduceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MapReduceInputs& dims) +{ return os; } @@ -61,8 +59,9 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void mapReduceLaunch(OutType *out_ref, OutType *out, const InType *in, - size_t len, cudaStream_t stream) { +void mapReduceLaunch( + OutType* out_ref, OutType* out, const InType* in, size_t len, cudaStream_t stream) +{ auto op = [] __device__(InType in) { return in; }; naiveMapReduce(out_ref, in, len, op, stream); mapThenSumReduce(out, len, op, 0, in); @@ -78,10 +77,12 @@ class MapReduceTest : public ::testing::TestWithParam> { out_ref(params.len, stream), out(params.len, stream) - {} + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); auto len = params.len; r.uniform(in.data(), len, InType(-1.0), InType(1.0), stream); @@ -98,42 +99,40 @@ class MapReduceTest : public ::testing::TestWithParam> { rmm::device_uvector out_ref, out; }; -const std::vector> inputsf = { - {0.001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf = {{0.001f, 1024 * 1024, 1234ULL}}; typedef MapReduceTest MapReduceTestFF; -TEST_P(MapReduceTestFF, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MapReduceTestFF, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFF, ::testing::ValuesIn(inputsf)); typedef MapReduceTest MapReduceTestFD; -TEST_P(MapReduceTestFD, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MapReduceTestFD, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFD, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFD, ::testing::ValuesIn(inputsf)); -const std::vector> inputsd = { - {0.000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd = {{0.000001, 1024 * 1024, 1234ULL}}; typedef MapReduceTest MapReduceTestDD; -TEST_P(MapReduceTestDD, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MapReduceTestDD, Result) +{ + ASSERT_TRUE( + devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestDD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestDD, ::testing::ValuesIn(inputsd)); template class MapGenericReduceTest : public ::testing::Test { - using InType = typename T::first_type; + using InType = typename T::first_type; using OutType = typename T::second_type; protected: - MapGenericReduceTest() - : input(n, handle.get_stream()), output(handle.get_stream()) { + MapGenericReduceTest() : input(n, handle.get_stream()), output(handle.get_stream()) + { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); initInput(input.data(), input.size(), stream); @@ -142,7 +141,8 @@ class MapGenericReduceTest : public ::testing::Test { void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } public: - void initInput(InType *input, int n, cudaStream_t stream) { + void initInput(InType* input, int n, cudaStream_t stream) + { raft::random::Rng r(137); r.uniform(input, n, InType(2), InType(3), stream); InType val = 1; @@ -151,21 +151,19 @@ class MapGenericReduceTest : public ::testing::Test { raft::update_device(input + 337, &val, 1, stream); } - void testMin() { - auto op = [] __device__(InType in) { return in; }; + void testMin() + { + auto op = [] __device__(InType in) { return in; }; const OutType neutral = std::numeric_limits::max(); - mapThenReduce(output.data(), input.size(), neutral, op, cub::Min(), stream, - input.data()); - EXPECT_TRUE(raft::devArrMatch(OutType(1), output.data(), 1, - raft::Compare())); + mapThenReduce(output.data(), input.size(), neutral, op, cub::Min(), stream, input.data()); + EXPECT_TRUE(raft::devArrMatch(OutType(1), output.data(), 1, raft::Compare())); } - void testMax() { - auto op = [] __device__(InType in) { return in; }; + void testMax() + { + auto op = [] __device__(InType in) { return in; }; const OutType neutral = std::numeric_limits::min(); - mapThenReduce(output.data(), input.size(), neutral, op, cub::Max(), stream, - input.data()); - EXPECT_TRUE(raft::devArrMatch(OutType(5), output.data(), 1, - raft::Compare())); + mapThenReduce(output.data(), input.size(), neutral, op, cub::Max(), stream, input.data()); + EXPECT_TRUE(raft::devArrMatch(OutType(5), output.data(), 1, raft::Compare())); } protected: @@ -178,8 +176,7 @@ class MapGenericReduceTest : public ::testing::Test { }; using IoTypePair = - ::testing::Types, std::pair, - std::pair>; + ::testing::Types, std::pair, std::pair>; TYPED_TEST_CASE(MapGenericReduceTest, IoTypePair); TYPED_TEST(MapGenericReduceTest, min) { this->testMin(); } diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index aad1d1e137..3db7c53041 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -32,8 +32,8 @@ struct MatVecOpInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const MatVecOpInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MatVecOpInputs& dims) +{ return os; } @@ -41,24 +41,45 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void matrixVectorOpLaunch(T *out, const T *in, const T *vec1, const T *vec2, - IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, bool useTwoVectors, - cudaStream_t stream) { +void matrixVectorOpLaunch(T* out, + const T* in, + const T* vec1, + const T* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + bool useTwoVectors, + cudaStream_t stream) +{ if (useTwoVectors) { matrixVectorOp( - out, in, vec1, vec2, D, N, rowMajor, bcastAlongRows, - [] __device__(T a, T b, T c) { return a + b + c; }, stream); + out, + in, + vec1, + vec2, + D, + N, + rowMajor, + bcastAlongRows, + [] __device__(T a, T b, T c) { return a + b + c; }, + stream); } else { matrixVectorOp( - out, in, vec1, D, N, rowMajor, bcastAlongRows, - [] __device__(T a, T b) { return a + b; }, stream); + out, + in, + vec1, + D, + N, + rowMajor, + bcastAlongRows, + [] __device__(T a, T b) { return a + b; }, + stream); } } template -class MatVecOpTest - : public ::testing::TestWithParam> { +class MatVecOpTest : public ::testing::TestWithParam> { public: MatVecOpTest() : params(::testing::TestWithParam>::GetParam()), @@ -67,27 +88,50 @@ class MatVecOpTest out_ref(params.rows * params.cols, stream), out(params.rows * params.cols, stream), vec1(params.bcastAlongRows ? params.cols : params.rows, stream), - vec2(params.bcastAlongRows ? params.cols : params.rows, stream) {} + vec2(params.bcastAlongRows ? params.cols : params.rows, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); IdxType N = params.rows, D = params.cols; - IdxType len = N * D; + IdxType len = N * D; IdxType vecLen = params.bcastAlongRows ? D : N; r.uniform(in.data(), len, (T)-1.0, (T)1.0, stream); r.uniform(vec1.data(), vecLen, (T)-1.0, (T)1.0, stream); r.uniform(vec2.data(), vecLen, (T)-1.0, (T)1.0, stream); if (params.useTwoVectors) { - naiveMatVec(out_ref.data(), in.data(), vec1.data(), vec2.data(), D, N, - params.rowMajor, params.bcastAlongRows, (T)1.0); + naiveMatVec(out_ref.data(), + in.data(), + vec1.data(), + vec2.data(), + D, + N, + params.rowMajor, + params.bcastAlongRows, + (T)1.0); } else { - naiveMatVec(out_ref.data(), in.data(), vec1.data(), D, N, params.rowMajor, - params.bcastAlongRows, (T)1.0); + naiveMatVec(out_ref.data(), + in.data(), + vec1.data(), + D, + N, + params.rowMajor, + params.bcastAlongRows, + (T)1.0); } - matrixVectorOpLaunch(out.data(), in.data(), vec1.data(), vec2.data(), D, N, - params.rowMajor, params.bcastAlongRows, - params.useTwoVectors, stream); + matrixVectorOpLaunch(out.data(), + in.data(), + vec1.data(), + vec2.data(), + D, + N, + params.rowMajor, + params.bcastAlongRows, + params.useTwoVectors, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -118,23 +162,23 @@ const std::vector> inputsf_i32 = { {0.00001f, 1024, 32, false, false, true, 1234ULL}, {0.00001f, 1024, 64, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestF_i32; -TEST_P(MatVecOpTestF_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.rows * params.cols, - CompareApprox(params.tolerance))); +TEST_P(MatVecOpTestF_i32, Result) +{ + ASSERT_TRUE(devArrMatch( + out_ref.data(), out.data(), params.rows * params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32, - ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32, ::testing::ValuesIn(inputsf_i32)); const std::vector> inputsf_i64 = { {0.00001f, 2500, 250, false, false, false, 1234ULL}, {0.00001f, 2500, 250, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestF_i64; -TEST_P(MatVecOpTestF_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.rows * params.cols, - CompareApprox(params.tolerance))); +TEST_P(MatVecOpTestF_i64, Result) +{ + ASSERT_TRUE(devArrMatch( + out_ref.data(), out.data(), params.rows * params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64, - ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64, ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsd_i32 = { {0.0000001, 1024, 32, true, true, false, 1234ULL}, @@ -155,23 +199,27 @@ const std::vector> inputsd_i32 = { {0.0000001, 1024, 32, false, false, true, 1234ULL}, {0.0000001, 1024, 64, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestD_i32; -TEST_P(MatVecOpTestD_i32, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.rows * params.cols, +TEST_P(MatVecOpTestD_i32, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref.data(), + out.data(), + params.rows * params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32, - ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32, ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.0000001, 2500, 250, false, false, false, 1234ULL}, {0.0000001, 2500, 250, false, false, true, 1234ULL}}; typedef MatVecOpTest MatVecOpTestD_i64; -TEST_P(MatVecOpTestD_i64, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.rows * params.cols, +TEST_P(MatVecOpTestD_i64, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref.data(), + out.data(), + params.rows * params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64, - ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64, ::testing::ValuesIn(inputsd_i64)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh index 69c45c9866..5f9c6f1ef3 100644 --- a/cpp/test/linalg/matrix_vector_op.cuh +++ b/cpp/test/linalg/matrix_vector_op.cuh @@ -22,9 +22,15 @@ namespace raft { namespace linalg { template -__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec, - IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, Type scalar) { +__global__ void naiveMatVecKernel(Type* out, + const Type* mat, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar) +{ IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; IdxType len = N * D; IdxType col; @@ -37,27 +43,37 @@ __global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec, } else { col = idx / N; } - if (idx < len) { - out[idx] = mat[idx] + scalar * vec[col]; - } + if (idx < len) { out[idx] = mat[idx] + scalar * vec[col]; } } template -void naiveMatVec(Type *out, const Type *mat, const Type *vec, IdxType D, - IdxType N, bool rowMajor, bool bcastAlongRows, Type scalar) { +void naiveMatVec(Type* out, + const Type* mat, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar) +{ static const IdxType TPB = 64; - IdxType len = N * D; - IdxType nblks = raft::ceildiv(len, TPB); - naiveMatVecKernel - <<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar); + IdxType len = N * D; + IdxType nblks = raft::ceildiv(len, TPB); + naiveMatVecKernel<<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec1, - const Type *vec2, IdxType D, IdxType N, - bool rowMajor, bool bcastAlongRows, - Type scalar) { +__global__ void naiveMatVecKernel(Type* out, + const Type* mat, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar) +{ IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; IdxType len = N * D; IdxType col; @@ -70,20 +86,25 @@ __global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec1, } else { col = idx / N; } - if (idx < len) { - out[idx] = mat[idx] + scalar * vec1[col] + vec2[col]; - } + if (idx < len) { out[idx] = mat[idx] + scalar * vec1[col] + vec2[col]; } } template -void naiveMatVec(Type *out, const Type *mat, const Type *vec1, const Type *vec2, - IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows, - Type scalar) { +void naiveMatVec(Type* out, + const Type* mat, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar) +{ static const IdxType TPB = 64; - IdxType len = N * D; - IdxType nblks = raft::ceildiv(len, TPB); - naiveMatVecKernel<<>>(out, mat, vec1, vec2, D, N, rowMajor, - bcastAlongRows, scalar); + IdxType len = N * D; + IdxType nblks = raft::ceildiv(len, TPB); + naiveMatVecKernel + <<>>(out, mat, vec1, vec2, D, N, rowMajor, bcastAlongRows, scalar); CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu index f78ae64f05..2a632d55b2 100644 --- a/cpp/test/linalg/multiply.cu +++ b/cpp/test/linalg/multiply.cu @@ -32,10 +32,13 @@ class MultiplyTest : public ::testing::TestWithParam> { stream(handle.get_stream()), in(params.len, stream), out_ref(params.len, stream), - out(params.len, stream) {} + out(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; @@ -53,25 +56,23 @@ class MultiplyTest : public ::testing::TestWithParam> { rmm::device_uvector in, out_ref, out; }; -const std::vector> inputsf = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; typedef MultiplyTest MultiplyTestF; -TEST_P(MultiplyTestF, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - raft::CompareApprox(params.tolerance))); +TEST_P(MultiplyTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestF, ::testing::ValuesIn(inputsf)); typedef MultiplyTest MultiplyTestD; -const std::vector> inputsd = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; -TEST_P(MultiplyTestD, Result) { - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - raft::CompareApprox(params.tolerance))); +const std::vector> inputsd = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +TEST_P(MultiplyTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu index 659956534e..6dae606f18 100644 --- a/cpp/test/linalg/norm.cu +++ b/cpp/test/linalg/norm.cu @@ -34,17 +34,19 @@ struct NormInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const NormInputs &I) { - os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " - << I.type << ", " << I.do_sqrt << ", " << I.seed << '}' << std::endl; +::std::ostream& operator<<(::std::ostream& os, const NormInputs& I) +{ + os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " << I.type << ", " + << I.do_sqrt << ", " << I.seed << '}' << std::endl; return os; } ///// Row-wise norm test definitions template -__global__ void naiveRowNormKernel(Type *dots, const Type *data, int D, int N, - NormType type, bool do_sqrt) { - Type acc = (Type)0; +__global__ void naiveRowNormKernel( + Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt) +{ + Type acc = (Type)0; int rowStart = threadIdx.x + blockIdx.x * blockDim.x; if (rowStart < N) { for (int i = 0; i < D; ++i) { @@ -59,12 +61,12 @@ __global__ void naiveRowNormKernel(Type *dots, const Type *data, int D, int N, } template -void naiveRowNorm(Type *dots, const Type *data, int D, int N, NormType type, - bool do_sqrt, cudaStream_t stream) { +void naiveRowNorm( + Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(N, TPB); - naiveRowNormKernel - <<>>(dots, data, D, N, type, do_sqrt); + int nblks = raft::ceildiv(N, TPB); + naiveRowNormKernel<<>>(dots, data, D, N, type, do_sqrt); CUDA_CHECK(cudaPeekAtLastError()); } @@ -76,21 +78,22 @@ class RowNormTest : public ::testing::TestWithParam> { stream(handle.get_stream()), data(params.rows * params.cols, stream), dots_exp(params.rows, stream), - dots_act(params.rows, stream) {} + dots_act(params.rows, stream) + { + } - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols, len = rows * cols; r.uniform(data.data(), len, T(-1.0), T(1.0), stream); - naiveRowNorm(dots_exp.data(), data.data(), cols, rows, params.type, - params.do_sqrt, stream); + naiveRowNorm(dots_exp.data(), data.data(), cols, rows, params.type, params.do_sqrt, stream); if (params.do_sqrt) { auto fin_op = [] __device__(T in) { return raft::mySqrt(in); }; - rowNorm(dots_act.data(), data.data(), cols, rows, params.type, - params.rowMajor, stream, fin_op); + rowNorm( + dots_act.data(), data.data(), cols, rows, params.type, params.rowMajor, stream, fin_op); } else { - rowNorm(dots_act.data(), data.data(), cols, rows, params.type, - params.rowMajor, stream); + rowNorm(dots_act.data(), data.data(), cols, rows, params.type, params.rowMajor, stream); } CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -105,10 +108,11 @@ class RowNormTest : public ::testing::TestWithParam> { ///// Column-wise norm test definitisons template -__global__ void naiveColNormKernel(Type *dots, const Type *data, int D, int N, - NormType type, bool do_sqrt) { +__global__ void naiveColNormKernel( + Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt) +{ int colID = threadIdx.x + blockIdx.x * blockDim.x; - if (colID > D) return; //avoid out-of-bounds thread + if (colID > D) return; // avoid out-of-bounds thread Type acc = 0; for (int i = 0; i < N; i++) { @@ -120,12 +124,12 @@ __global__ void naiveColNormKernel(Type *dots, const Type *data, int D, int N, } template -void naiveColNorm(Type *dots, const Type *data, int D, int N, NormType type, - bool do_sqrt, cudaStream_t stream) { +void naiveColNorm( + Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(D, TPB); - naiveColNormKernel - <<>>(dots, data, D, N, type, do_sqrt); + int nblks = raft::ceildiv(D, TPB); + naiveColNormKernel<<>>(dots, data, D, N, type, do_sqrt); CUDA_CHECK(cudaPeekAtLastError()); } @@ -137,22 +141,23 @@ class ColNormTest : public ::testing::TestWithParam> { stream(handle.get_stream()), data(params.rows * params.cols, stream), dots_exp(params.cols, stream), - dots_act(params.cols, stream) {} + dots_act(params.cols, stream) + { + } - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols, len = rows * cols; r.uniform(data.data(), len, T(-1.0), T(1.0), stream); - naiveColNorm(dots_exp.data(), data.data(), cols, rows, params.type, - params.do_sqrt, stream); + naiveColNorm(dots_exp.data(), data.data(), cols, rows, params.type, params.do_sqrt, stream); if (params.do_sqrt) { auto fin_op = [] __device__(T in) { return raft::mySqrt(in); }; - colNorm(dots_act.data(), data.data(), cols, rows, params.type, - params.rowMajor, stream, fin_op); + colNorm( + dots_act.data(), data.data(), cols, rows, params.type, params.rowMajor, stream, fin_op); } else { - colNorm(dots_act.data(), data.data(), cols, rows, params.type, - params.rowMajor, stream); + colNorm(dots_act.data(), data.data(), cols, rows, params.type, params.rowMajor, stream); } CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -166,24 +171,23 @@ class ColNormTest : public ::testing::TestWithParam> { }; ///// Row- and column-wise tests -const std::vector> inputsf = { - {0.00001f, 1024, 32, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL}, - {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL}, - {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL}, - {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL}, - {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL}, - - {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL}, - {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL}, - {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL}, - {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL}, - {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}}; +const std::vector> inputsf = {{0.00001f, 1024, 32, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL}, + {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL}, + {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL}, + {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL}, + {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL}, + + {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL}, + {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL}, + {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL}, + {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL}, + {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}}; const std::vector> inputsd = { {0.00000001, 1024, 32, L1Norm, false, true, 1234ULL}, @@ -205,22 +209,22 @@ const std::vector> inputsd = { {0.00000001, 1024, 256, L2Norm, true, true, 1234ULL}}; typedef RowNormTest RowNormTestF; -TEST_P(RowNormTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp.data(), dots_act.data(), params.rows, - raft::CompareApprox(params.tolerance))); +TEST_P(RowNormTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp.data(), dots_act.data(), params.rows, raft::CompareApprox(params.tolerance))); } typedef RowNormTest RowNormTestD; -TEST_P(RowNormTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp.data(), dots_act.data(), params.rows, - raft::CompareApprox(params.tolerance))); +TEST_P(RowNormTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp.data(), dots_act.data(), params.rows, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD, ::testing::ValuesIn(inputsd)); const std::vector> inputscf = { {0.00001f, 32, 1024, L1Norm, false, true, 1234ULL}, @@ -261,22 +265,22 @@ const std::vector> inputscd = { {0.00000001, 256, 1024, L2Norm, true, true, 1234ULL}}; typedef ColNormTest ColNormTestF; -TEST_P(ColNormTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp.data(), dots_act.data(), params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(ColNormTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp.data(), dots_act.data(), params.cols, raft::CompareApprox(params.tolerance))); } typedef ColNormTest ColNormTestD; -TEST_P(ColNormTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(dots_exp.data(), dots_act.data(), params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(ColNormTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + dots_exp.data(), dots_act.data(), params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF, - ::testing::ValuesIn(inputscf)); +INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF, ::testing::ValuesIn(inputscf)); -INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD, - ::testing::ValuesIn(inputscd)); +INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD, ::testing::ValuesIn(inputscd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index 9822ca2c60..25ee0a7b77 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -34,8 +34,8 @@ struct ReduceInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const ReduceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const ReduceInputs& dims) +{ return os; } @@ -43,44 +43,58 @@ template // for an extended __device__ lambda cannot have private or protected access // within its class template -void reduceLaunch(OutType *dots, const InType *data, int cols, int rows, - bool rowMajor, bool alongRows, bool inplace, - cudaStream_t stream) { - reduce( - dots, data, cols, rows, (OutType)0, rowMajor, alongRows, stream, inplace, - [] __device__(InType in, int i) { return static_cast(in * in); }); +void reduceLaunch(OutType* dots, + const InType* data, + int cols, + int rows, + bool rowMajor, + bool alongRows, + bool inplace, + cudaStream_t stream) +{ + reduce(dots, + data, + cols, + rows, + (OutType)0, + rowMajor, + alongRows, + stream, + inplace, + [] __device__(InType in, int i) { return static_cast(in * in); }); } template -class ReduceTest - : public ::testing::TestWithParam> { +class ReduceTest : public ::testing::TestWithParam> { public: ReduceTest() - : params( - ::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), data(params.rows * params.cols, stream), dots_exp(params.alongRows ? params.rows : params.cols, stream), - dots_act(params.alongRows ? params.rows : params.cols, stream) {} + dots_act(params.alongRows ? params.rows : params.cols, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols; int len = rows * cols; - outlen = params.alongRows ? rows : cols; + outlen = params.alongRows ? rows : cols; r.uniform(data.data(), len, InType(-1.0), InType(1.0), stream); - naiveReduction(dots_exp.data(), data.data(), cols, rows, params.rowMajor, - params.alongRows, stream); + naiveReduction( + dots_exp.data(), data.data(), cols, rows, params.rowMajor, params.alongRows, stream); // Perform reduction with default inplace = false first - reduceLaunch(dots_act.data(), data.data(), cols, rows, params.rowMajor, - params.alongRows, false, stream); + reduceLaunch( + dots_act.data(), data.data(), cols, rows, params.rowMajor, params.alongRows, false, stream); // Add to result with inplace = true next, which shouldn't affect // in the case of coalescedReduction! if (!(params.rowMajor ^ params.alongRows)) { - reduceLaunch(dots_act.data(), data.data(), cols, rows, params.rowMajor, - params.alongRows, true, stream); + reduceLaunch( + dots_act.data(), data.data(), cols, rows, params.rowMajor, params.alongRows, true, stream); } CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -150,31 +164,31 @@ const std::vector> inputsfd = { {0.000002f, 1024, 256, false, false, 1234ULL}}; typedef ReduceTest ReduceTestFF; -TEST_P(ReduceTestFF, Result) { - ASSERT_TRUE(devArrMatch(dots_exp.data(), dots_act.data(), outlen, - raft::CompareApprox(params.tolerance))); +TEST_P(ReduceTestFF, Result) +{ + ASSERT_TRUE(devArrMatch( + dots_exp.data(), dots_act.data(), outlen, raft::CompareApprox(params.tolerance))); } typedef ReduceTest ReduceTestDD; -TEST_P(ReduceTestDD, Result) { - ASSERT_TRUE(devArrMatch(dots_exp.data(), dots_act.data(), outlen, - raft::CompareApprox(params.tolerance))); +TEST_P(ReduceTestDD, Result) +{ + ASSERT_TRUE(devArrMatch( + dots_exp.data(), dots_act.data(), outlen, raft::CompareApprox(params.tolerance))); } typedef ReduceTest ReduceTestFD; -TEST_P(ReduceTestFD, Result) { - ASSERT_TRUE(devArrMatch(dots_exp.data(), dots_act.data(), outlen, - raft::CompareApprox(params.tolerance))); +TEST_P(ReduceTestFD, Result) +{ + ASSERT_TRUE(devArrMatch( + dots_exp.data(), dots_act.data(), outlen, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF, - ::testing::ValuesIn(inputsff)); +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF, ::testing::ValuesIn(inputsff)); -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD, - ::testing::ValuesIn(inputsdd)); +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD, ::testing::ValuesIn(inputsdd)); -INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD, - ::testing::ValuesIn(inputsfd)); +INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD, ::testing::ValuesIn(inputsfd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh index 7f8319636b..82ddfd4661 100644 --- a/cpp/test/linalg/reduce.cuh +++ b/cpp/test/linalg/reduce.cuh @@ -26,55 +26,60 @@ namespace raft { namespace linalg { template -__global__ void naiveCoalescedReductionKernel(OutType *dots, const InType *data, - int D, int N) { - OutType acc = (OutType)0; +__global__ void naiveCoalescedReductionKernel(OutType* dots, const InType* data, int D, int N) +{ + OutType acc = (OutType)0; int rowStart = threadIdx.x + blockIdx.x * blockDim.x; if (rowStart < N) { for (int i = 0; i < D; ++i) { - acc += - static_cast(data[rowStart * D + i] * data[rowStart * D + i]); + acc += static_cast(data[rowStart * D + i] * data[rowStart * D + i]); } dots[rowStart] = 2 * acc; } } template -void naiveCoalescedReduction(OutType *dots, const InType *data, int D, int N, - cudaStream_t stream) { +void naiveCoalescedReduction(OutType* dots, const InType* data, int D, int N, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(N, TPB); - naiveCoalescedReductionKernel - <<>>(dots, data, D, N); + int nblks = raft::ceildiv(N, TPB); + naiveCoalescedReductionKernel<<>>(dots, data, D, N); CUDA_CHECK(cudaPeekAtLastError()); } template -void unaryAndGemv(OutType *dots, const InType *data, int D, int N, - cudaStream_t stream) { - //computes a MLCommon unary op on data (squares it), then computes Ax +void unaryAndGemv(OutType* dots, const InType* data, int D, int N, cudaStream_t stream) +{ + // computes a MLCommon unary op on data (squares it), then computes Ax //(A input matrix and x column vector) to sum columns rmm::device_uvector sq(D * N, stream); raft::linalg::unaryOp( - thrust::raw_pointer_cast(sq.data()), data, D * N, - [] __device__(InType v) { return static_cast(v * v); }, stream); + thrust::raw_pointer_cast(sq.data()), + data, + D * N, + [] __device__(InType v) { return static_cast(v * v); }, + stream); cublasHandle_t handle; CUBLAS_CHECK(cublasCreate(&handle)); - rmm::device_uvector ones(N, stream); //column vector [1...1] + rmm::device_uvector ones(N, stream); // column vector [1...1] raft::linalg::unaryOp( - ones.data(), ones.data(), ones.size(), - [=] __device__(OutType input) { return 1; }, stream); + ones.data(), ones.data(), ones.size(), [=] __device__(OutType input) { return 1; }, stream); OutType alpha = 1, beta = 0; - CUBLAS_CHECK(raft::linalg::cublasgemv(handle, CUBLAS_OP_N, D, N, &alpha, - sq.data(), D, ones.data(), 1, &beta, - dots, 1, stream)); + CUBLAS_CHECK(raft::linalg::cublasgemv( + handle, CUBLAS_OP_N, D, N, &alpha, sq.data(), D, ones.data(), 1, &beta, dots, 1, stream)); CUDA_CHECK(cudaDeviceSynchronize()); CUBLAS_CHECK(cublasDestroy(handle)); } template -void naiveReduction(OutType *dots, const InType *data, int D, int N, - bool rowMajor, bool alongRows, cudaStream_t stream) { +void naiveReduction(OutType* dots, + const InType* data, + int D, + int N, + bool rowMajor, + bool alongRows, + cudaStream_t stream) +{ if (rowMajor && alongRows) { naiveCoalescedReduction(dots, data, D, N, stream); } else if (rowMajor && !alongRows) { diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu index 4f761d39f6..ac387c16bb 100644 --- a/cpp/test/linalg/strided_reduction.cu +++ b/cpp/test/linalg/strided_reduction.cu @@ -32,15 +32,14 @@ struct stridedReductionInputs { }; template -void stridedReductionLaunch(T *dots, const T *data, int cols, int rows, - cudaStream_t stream) { - stridedReduction(dots, data, cols, rows, (T)0, stream, false, - [] __device__(T in, int i) { return in * in; }); +void stridedReductionLaunch(T* dots, const T* data, int cols, int rows, cudaStream_t stream) +{ + stridedReduction( + dots, data, cols, rows, (T)0, stream, false, [] __device__(T in, int i) { return in * in; }); } template -class stridedReductionTest - : public ::testing::TestWithParam> { +class stridedReductionTest : public ::testing::TestWithParam> { public: stridedReductionTest() : params(::testing::TestWithParam>::GetParam()), @@ -48,15 +47,17 @@ class stridedReductionTest data(params.rows * params.cols, stream), dots_exp(params.cols, stream), // expected dot products (from test) dots_act(params.cols, stream) // actual dot products (from prim) - {} + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols; int len = rows * cols; r.uniform(data.data(), len, T(-1.0), T(1.0), - stream); //initialize matrix to random + stream); // initialize matrix to random unaryAndGemv(dots_exp.data(), data.data(), cols, rows, stream); stridedReductionLaunch(dots_act.data(), data.data(), cols, rows, stream); @@ -71,35 +72,33 @@ class stridedReductionTest rmm::device_uvector data, dots_exp, dots_act; }; -const std::vector> inputsf = { - {0.00001f, 1024, 32, 1234ULL}, - {0.00001f, 1024, 64, 1234ULL}, - {0.00001f, 1024, 128, 1234ULL}, - {0.00001f, 1024, 256, 1234ULL}}; +const std::vector> inputsf = {{0.00001f, 1024, 32, 1234ULL}, + {0.00001f, 1024, 64, 1234ULL}, + {0.00001f, 1024, 128, 1234ULL}, + {0.00001f, 1024, 256, 1234ULL}}; -const std::vector> inputsd = { - {0.000000001, 1024, 32, 1234ULL}, - {0.000000001, 1024, 64, 1234ULL}, - {0.000000001, 1024, 128, 1234ULL}, - {0.000000001, 1024, 256, 1234ULL}}; +const std::vector> inputsd = {{0.000000001, 1024, 32, 1234ULL}, + {0.000000001, 1024, 64, 1234ULL}, + {0.000000001, 1024, 128, 1234ULL}, + {0.000000001, 1024, 256, 1234ULL}}; typedef stridedReductionTest stridedReductionTestF; -TEST_P(stridedReductionTestF, Result) { - ASSERT_TRUE(devArrMatch(dots_exp.data(), dots_act.data(), params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(stridedReductionTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + dots_exp.data(), dots_act.data(), params.cols, raft::CompareApprox(params.tolerance))); } typedef stridedReductionTest stridedReductionTestD; -TEST_P(stridedReductionTestD, Result) { - ASSERT_TRUE(devArrMatch(dots_exp.data(), dots_act.data(), params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(stridedReductionTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + dots_exp.data(), dots_act.data(), params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestD, ::testing::ValuesIn(inputsd)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu index 0a82da61c9..77c14a8a7b 100644 --- a/cpp/test/linalg/subtract.cu +++ b/cpp/test/linalg/subtract.cu @@ -24,39 +24,34 @@ namespace raft { namespace linalg { template -__global__ void naiveSubtractElemKernel(Type *out, const Type *in1, - const Type *in2, int len) { +__global__ void naiveSubtractElemKernel(Type* out, const Type* in1, const Type* in2, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in1[idx] - in2[idx]; - } + if (idx < len) { out[idx] = in1[idx] - in2[idx]; } } template -void naiveSubtractElem(Type *out, const Type *in1, const Type *in2, int len, - cudaStream_t stream) { +void naiveSubtractElem(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); naiveSubtractElemKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void naiveSubtractScalarKernel(Type *out, const Type *in1, - const Type in2, int len) { +__global__ void naiveSubtractScalarKernel(Type* out, const Type* in1, const Type in2, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in1[idx] - in2; - } + if (idx < len) { out[idx] = in1[idx] - in2; } } template -void naiveSubtractScalar(Type *out, const Type *in1, const Type in2, int len, - cudaStream_t stream) { +void naiveSubtractScalar(Type* out, const Type* in1, const Type in2, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); - naiveSubtractScalarKernel - <<>>(out, in1, in2, len); + int nblks = raft::ceildiv(len, TPB); + naiveSubtractScalarKernel<<>>(out, in1, in2, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -68,7 +63,8 @@ struct SubtractInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const SubtractInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SubtractInputs& dims) +{ return os; } @@ -81,10 +77,13 @@ class SubtractTest : public ::testing::TestWithParam> { in1(params.len, stream), in2(params.len, stream), out_ref(params.len, stream), - out(params.len, stream) {} + out(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int len = params.len; r.uniform(in1.data(), len, T(-1.0), T(1.0), stream); @@ -108,35 +107,33 @@ class SubtractTest : public ::testing::TestWithParam> { rmm::device_uvector in1, in2, out_ref, out; }; -const std::vector> inputsf2 = { - {0.000001f, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}}; -const std::vector> inputsd2 = { - {0.00000001, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}}; typedef SubtractTest SubtractTestF; -TEST_P(SubtractTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), params.len, - raft::CompareApprox(params.tolerance))); +TEST_P(SubtractTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); - ASSERT_TRUE(raft::devArrMatch(out_ref.data(), in1.data(), params.len, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), in1.data(), params.len, raft::CompareApprox(params.tolerance))); } typedef SubtractTest SubtractTestD; -TEST_P(SubtractTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), params.len, - raft::CompareApprox(params.tolerance))); +TEST_P(SubtractTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), out.data(), params.len, raft::CompareApprox(params.tolerance))); - ASSERT_TRUE(raft::devArrMatch(out_ref.data(), in1.data(), params.len, - raft::CompareApprox(params.tolerance))); + ASSERT_TRUE(raft::devArrMatch( + out_ref.data(), in1.data(), params.len, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestD, ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu index 8ebbf19683..61c2c2e3db 100644 --- a/cpp/test/linalg/svd.cu +++ b/cpp/test/linalg/svd.cu @@ -35,7 +35,8 @@ struct SvdInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const SvdInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SvdInputs& dims) +{ return os; } @@ -51,10 +52,13 @@ class SvdTest : public ::testing::TestWithParam> { sing_vals_qr(params.n_col, stream), left_eig_vectors_ref(params.n_row * params.n_col, stream), right_eig_vectors_ref(params.n_col * params.n_col, stream), - sing_vals_ref(params.len, stream) {} + sing_vals_ref(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int len = params.len; @@ -63,26 +67,30 @@ class SvdTest : public ::testing::TestWithParam> { T data_h[] = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0}; raft::update_device(data.data(), data_h, len, stream); - int left_evl = params.n_row * params.n_col; + int left_evl = params.n_row * params.n_col; int right_evl = params.n_col * params.n_col; - T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695, - 0.488195, 0.110706, -0.865685}; + T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695, 0.488195, 0.110706, -0.865685}; T right_eig_vectors_ref_h[] = {-0.638636, -0.769509, -0.769509, 0.638636}; T sing_vals_ref_h[] = {7.065283, 1.040081}; - raft::update_device(left_eig_vectors_ref.data(), left_eig_vectors_ref_h, - left_evl, stream); - raft::update_device(right_eig_vectors_ref.data(), right_eig_vectors_ref_h, - right_evl, stream); - raft::update_device(sing_vals_ref.data(), sing_vals_ref_h, params.n_col, - stream); - - svdQR(handle, data.data(), params.n_row, params.n_col, sing_vals_qr.data(), - left_eig_vectors_qr.data(), right_eig_vectors_trans_qr.data(), true, - true, true, stream); + raft::update_device(left_eig_vectors_ref.data(), left_eig_vectors_ref_h, left_evl, stream); + raft::update_device(right_eig_vectors_ref.data(), right_eig_vectors_ref_h, right_evl, stream); + raft::update_device(sing_vals_ref.data(), sing_vals_ref_h, params.n_col, stream); + + svdQR(handle, + data.data(), + params.n_row, + params.n_col, + sing_vals_qr.data(), + left_eig_vectors_qr.data(), + right_eig_vectors_trans_qr.data(), + true, + true, + true, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -91,71 +99,75 @@ class SvdTest : public ::testing::TestWithParam> { cudaStream_t stream; SvdInputs params; - rmm::device_uvector data, left_eig_vectors_qr, right_eig_vectors_trans_qr, - sing_vals_qr, left_eig_vectors_ref, right_eig_vectors_ref, sing_vals_ref; + rmm::device_uvector data, left_eig_vectors_qr, right_eig_vectors_trans_qr, sing_vals_qr, + left_eig_vectors_ref, right_eig_vectors_ref, sing_vals_ref; }; -const std::vector> inputsf2 = { - {0.00001f, 3 * 2, 3, 2, 1234ULL}}; +const std::vector> inputsf2 = {{0.00001f, 3 * 2, 3, 2, 1234ULL}}; -const std::vector> inputsd2 = { - {0.00001, 3 * 2, 3, 2, 1234ULL}}; +const std::vector> inputsd2 = {{0.00001, 3 * 2, 3, 2, 1234ULL}}; typedef SvdTest SvdTestValF; -TEST_P(SvdTestValF, Result) { - ASSERT_TRUE( - raft::devArrMatch(sing_vals_ref.data(), sing_vals_qr.data(), params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestValF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(sing_vals_ref.data(), + sing_vals_qr.data(), + params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestValD; -TEST_P(SvdTestValD, Result) { - ASSERT_TRUE( - raft::devArrMatch(sing_vals_ref.data(), sing_vals_qr.data(), params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestValD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(sing_vals_ref.data(), + sing_vals_qr.data(), + params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestLeftVecF; -TEST_P(SvdTestLeftVecF, Result) { - ASSERT_TRUE( - raft::devArrMatch(left_eig_vectors_ref.data(), left_eig_vectors_qr.data(), - params.n_row * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestLeftVecF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(left_eig_vectors_ref.data(), + left_eig_vectors_qr.data(), + params.n_row * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestLeftVecD; -TEST_P(SvdTestLeftVecD, Result) { - ASSERT_TRUE( - raft::devArrMatch(left_eig_vectors_ref.data(), left_eig_vectors_qr.data(), - params.n_row * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestLeftVecD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(left_eig_vectors_ref.data(), + left_eig_vectors_qr.data(), + params.n_row * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestRightVecF; -TEST_P(SvdTestRightVecF, Result) { - ASSERT_TRUE(raft::devArrMatch( - right_eig_vectors_ref.data(), right_eig_vectors_trans_qr.data(), - params.n_col * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestRightVecF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(right_eig_vectors_ref.data(), + right_eig_vectors_trans_qr.data(), + params.n_col * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } typedef SvdTest SvdTestRightVecD; -TEST_P(SvdTestRightVecD, Result) { - ASSERT_TRUE(raft::devArrMatch( - right_eig_vectors_ref.data(), right_eig_vectors_trans_qr.data(), - params.n_col * params.n_col, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(SvdTestRightVecD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(right_eig_vectors_ref.data(), + right_eig_vectors_trans_qr.data(), + params.n_col * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValF, ::testing::ValuesIn(inputsf2)); INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValD, ::testing::ValuesIn(inputsd2)); -INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecD, ::testing::ValuesIn(inputsd2)); // INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecF, // ::testing::ValuesIn(inputsf2)); diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index 1d8ef08673..fde5599bc1 100644 --- a/cpp/test/linalg/transpose.cu +++ b/cpp/test/linalg/transpose.cu @@ -34,7 +34,8 @@ struct TranposeInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const TranposeInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const TranposeInputs& dims) +{ return os; } @@ -46,10 +47,13 @@ class TransposeTest : public ::testing::TestWithParam> { stream(handle.get_stream()), data(params.len, stream), data_trans_ref(params.len, stream), - data_trans(params.len, stream) {} + data_trans(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { int len = params.len; ASSERT(params.len == 9, "This test works only with len=9!"); T data_h[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; @@ -57,8 +61,7 @@ class TransposeTest : public ::testing::TestWithParam> { T data_ref_h[] = {1.0, 4.0, 7.0, 2.0, 5.0, 8.0, 3.0, 6.0, 9.0}; raft::update_device(data_trans_ref.data(), data_ref_h, len, stream); - transpose(handle, data.data(), data_trans.data(), params.n_row, - params.n_col, stream); + transpose(handle, data.data(), data_trans.data(), params.n_row, params.n_col, stream); transpose(data.data(), params.n_row, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -71,39 +74,41 @@ class TransposeTest : public ::testing::TestWithParam> { rmm::device_uvector data, data_trans, data_trans_ref; }; -const std::vector> inputsf2 = { - {0.1f, 3 * 3, 3, 3, 1234ULL}}; +const std::vector> inputsf2 = {{0.1f, 3 * 3, 3, 3, 1234ULL}}; -const std::vector> inputsd2 = { - {0.1, 3 * 3, 3, 3, 1234ULL}}; +const std::vector> inputsd2 = {{0.1, 3 * 3, 3, 3, 1234ULL}}; typedef TransposeTest TransposeTestValF; -TEST_P(TransposeTestValF, Result) { - ASSERT_TRUE( - raft::devArrMatch(data_trans_ref.data(), data_trans.data(), params.len, - raft::CompareApproxAbs(params.tolerance))); - - ASSERT_TRUE( - raft::devArrMatch(data_trans_ref.data(), data.data(), params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(TransposeTestValF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), + data_trans.data(), + params.len, + raft::CompareApproxAbs(params.tolerance))); + + ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), + data.data(), + params.len, + raft::CompareApproxAbs(params.tolerance))); } typedef TransposeTest TransposeTestValD; -TEST_P(TransposeTestValD, Result) { - ASSERT_TRUE( - raft::devArrMatch(data_trans_ref.data(), data_trans.data(), params.len, - raft::CompareApproxAbs(params.tolerance))); - - ASSERT_TRUE( - raft::devArrMatch(data_trans_ref.data(), data.data(), params.len, - raft::CompareApproxAbs(params.tolerance))); +TEST_P(TransposeTestValD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), + data_trans.data(), + params.len, + raft::CompareApproxAbs(params.tolerance))); + + ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), + data.data(), + params.len, + raft::CompareApproxAbs(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD, ::testing::ValuesIn(inputsd2)); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu index 0fcf465150..ff6723973d 100644 --- a/cpp/test/linalg/unary_op.cu +++ b/cpp/test/linalg/unary_op.cu @@ -28,49 +28,49 @@ namespace linalg { // for an extended __device__ lambda cannot have private or protected access // within its class template -void unaryOpLaunch(OutType *out, const InType *in, InType scalar, IdxType len, - cudaStream_t stream) { +void unaryOpLaunch(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ if (in == nullptr) { auto op = [scalar] __device__(OutType * ptr, IdxType idx) { *ptr = static_cast(scalar * idx); }; writeOnlyUnaryOp(out, len, op, stream); } else { - auto op = [scalar] __device__(InType in) { - return static_cast(in * scalar); - }; + auto op = [scalar] __device__(InType in) { return static_cast(in * scalar); }; unaryOp(out, in, len, op, stream); } } template -class UnaryOpTest - : public ::testing::TestWithParam> { +class UnaryOpTest : public ::testing::TestWithParam> { public: UnaryOpTest() - : params(::testing::TestWithParam< - UnaryOpInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), in(params.len, stream), out_ref(params.len, stream), - out(params.len, stream) {} + out(params.len, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); auto len = params.len; r.uniform(in.data(), len, InType(-1.0), InType(1.0), stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } - virtual void DoTest() { - auto len = params.len; + virtual void DoTest() + { + auto len = params.len; auto scalar = params.scalar; naiveScale(out_ref.data(), in.data(), scalar, len, stream); unaryOpLaunch(out.data(), in.data(), scalar, len, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); - ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len, - CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch( + out_ref.data(), out.data(), params.len, CompareApprox(params.tolerance))); } protected: @@ -85,15 +85,15 @@ class UnaryOpTest template class WriteOnlyUnaryOpTest : public UnaryOpTest { protected: - void DoTest() override { - auto len = this->params.len; + void DoTest() override + { + auto len = this->params.len; auto scalar = this->params.scalar; - naiveScale(this->out_ref.data(), (OutType *)nullptr, scalar, len, - this->stream); - unaryOpLaunch(this->out.data(), (OutType *)nullptr, scalar, len, - this->stream); + naiveScale(this->out_ref.data(), (OutType*)nullptr, scalar, len, this->stream); + unaryOpLaunch(this->out.data(), (OutType*)nullptr, scalar, len, this->stream); CUDA_CHECK(cudaStreamSynchronize(this->stream)); - ASSERT_TRUE(devArrMatch(this->out_ref.data(), this->out.data(), + ASSERT_TRUE(devArrMatch(this->out_ref.data(), + this->out.data(), this->params.len, CompareApprox(this->params.tolerance))); } @@ -103,8 +103,7 @@ class WriteOnlyUnaryOpTest : public UnaryOpTest { TEST_P(Name, Result) { DoTest(); } \ INSTANTIATE_TEST_SUITE_P(UnaryOpTests, Name, ::testing::ValuesIn(inputs)) -const std::vector> inputsf_i32 = { - {0.000001f, 1024 * 1024, 2.f, 1234ULL}}; +const std::vector> inputsf_i32 = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}}; typedef UnaryOpTest UnaryOpTestF_i32; UNARY_OP_TEST(UnaryOpTestF_i32, inputsf_i32); typedef WriteOnlyUnaryOpTest WriteOnlyUnaryOpTestF_i32; diff --git a/cpp/test/linalg/unary_op.cuh b/cpp/test/linalg/unary_op.cuh index be3f1124c5..3343389af8 100644 --- a/cpp/test/linalg/unary_op.cuh +++ b/cpp/test/linalg/unary_op.cuh @@ -24,8 +24,8 @@ namespace raft { namespace linalg { template -__global__ void naiveScaleKernel(OutType *out, const InType *in, InType scalar, - IdxType len) { +__global__ void naiveScaleKernel(OutType* out, const InType* in, InType scalar, IdxType len) +{ IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x); if (idx < len) { if (in == nullptr) { @@ -38,12 +38,11 @@ __global__ void naiveScaleKernel(OutType *out, const InType *in, InType scalar, } template -void naiveScale(OutType *out, const InType *in, InType scalar, int len, - cudaStream_t stream) { +void naiveScale(OutType* out, const InType* in, InType scalar, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); - naiveScaleKernel - <<>>(out, in, scalar, len); + int nblks = raft::ceildiv(len, TPB); + naiveScaleKernel<<>>(out, in, scalar, len); CUDA_CHECK(cudaPeekAtLastError()); } @@ -56,8 +55,8 @@ struct UnaryOpInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const UnaryOpInputs &d) { +::std::ostream& operator<<(::std::ostream& os, const UnaryOpInputs& d) +{ return os; } diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu index 7c7f29815b..7042f5b48d 100644 --- a/cpp/test/matrix/math.cu +++ b/cpp/test/matrix/math.cu @@ -24,53 +24,51 @@ namespace raft { namespace matrix { template -__global__ void nativePowerKernel(Type *in, Type *out, int len) { +__global__ void nativePowerKernel(Type* in, Type* out, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = in[idx] * in[idx]; - } + if (idx < len) { out[idx] = in[idx] * in[idx]; } } template -void naivePower(Type *in, Type *out, int len, cudaStream_t stream) { +void naivePower(Type* in, Type* out, int len, cudaStream_t stream) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); nativePowerKernel<<>>(in, out, len); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void nativeSqrtKernel(Type *in, Type *out, int len) { +__global__ void nativeSqrtKernel(Type* in, Type* out, int len) +{ int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < len) { - out[idx] = sqrt(in[idx]); - } + if (idx < len) { out[idx] = sqrt(in[idx]); } } template -void naiveSqrt(Type *in, Type *out, int len) { +void naiveSqrt(Type* in, Type* out, int len) +{ static const int TPB = 64; - int nblks = raft::ceildiv(len, TPB); + int nblks = raft::ceildiv(len, TPB); nativeSqrtKernel<<>>(in, out, len); CUDA_CHECK(cudaPeekAtLastError()); } template -__global__ void naiveSignFlipKernel(Type *in, Type *out, int rowCount, - int colCount) { +__global__ void naiveSignFlipKernel(Type* in, Type* out, int rowCount, int colCount) +{ int d_i = blockIdx.x * rowCount; int end = d_i + rowCount; if (blockIdx.x < colCount) { - Type max = 0.0; + Type max = 0.0; int max_index = 0; for (int i = d_i; i < end; i++) { Type val = in[i]; - if (val < 0.0) { - val = -val; - } + if (val < 0.0) { val = -val; } if (val > max) { - max = val; + max = val; max_index = i; } } @@ -88,7 +86,8 @@ __global__ void naiveSignFlipKernel(Type *in, Type *out, int rowCount, } template -void naiveSignFlip(Type *in, Type *out, int rowCount, int colCount) { +void naiveSignFlip(Type* in, Type* out, int rowCount, int colCount) +{ naiveSignFlipKernel<<>>(in, out, rowCount, colCount); CUDA_CHECK(cudaPeekAtLastError()); } @@ -103,7 +102,8 @@ struct MathInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const MathInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MathInputs& dims) +{ return os; } @@ -126,12 +126,15 @@ class MathTest : public ::testing::TestWithParam> { out_recip(4, stream), in_smallzero(4, stream), out_smallzero(4, stream), - out_smallzero_ref(4, stream) {} + out_smallzero_ref(4, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { random::Rng r(params.seed); - int len = params.len; + int len = params.len; T in_ratio_h[4] = {1.0, 2.0, 2.0, 3.0}; update_device(in_ratio.data(), in_ratio_h, 4, stream); @@ -151,12 +154,11 @@ class MathTest : public ::testing::TestWithParam> { ratio(handle, in_ratio.data(), in_ratio.data(), 4, stream); - naiveSignFlip(in_sign_flip.data(), out_sign_flip_ref.data(), params.n_row, - params.n_col); + naiveSignFlip(in_sign_flip.data(), out_sign_flip_ref.data(), params.n_row, params.n_col); signFlip(in_sign_flip.data(), params.n_row, params.n_col, stream); // default threshold is 1e-15 - std::vector in_recip_h = {0.1, 0.01, -0.01, 0.1e-16}; + std::vector in_recip_h = {0.1, 0.01, -0.01, 0.1e-16}; std::vector in_recip_ref_h = {10.0, 100.0, -100.0, 0.0}; update_device(in_recip.data(), in_recip_h.data(), 4, stream); update_device(in_recip_ref.data(), in_recip_ref_h.data(), 4, stream); @@ -167,12 +169,11 @@ class MathTest : public ::testing::TestWithParam> { reciprocal(in_recip.data(), recip_scalar, 4, stream, true); - std::vector in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1}; + std::vector in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1}; std::vector in_small_val_zero_ref_h = {0.1, 0.0, 0.0, -0.1}; update_device(in_smallzero.data(), in_small_val_zero_h.data(), 4, stream); - update_device(out_smallzero_ref.data(), in_small_val_zero_ref_h.data(), 4, - stream); + update_device(out_smallzero_ref.data(), in_small_val_zero_ref_h.data(), 4, stream); setSmallValuesZero(out_smallzero.data(), in_smallzero.data(), 4, stream); setSmallValuesZero(in_smallzero.data(), 4, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -183,137 +184,139 @@ class MathTest : public ::testing::TestWithParam> { cudaStream_t stream; MathInputs params; - rmm::device_uvector in_power, out_power_ref, in_sqrt, out_sqrt_ref, - in_ratio, out_ratio_ref, in_sign_flip, out_sign_flip_ref, in_recip, - in_recip_ref, out_recip, in_smallzero, out_smallzero, out_smallzero_ref; + rmm::device_uvector in_power, out_power_ref, in_sqrt, out_sqrt_ref, in_ratio, out_ratio_ref, + in_sign_flip, out_sign_flip_ref, in_recip, in_recip_ref, out_recip, in_smallzero, out_smallzero, + out_smallzero_ref; }; -const std::vector> inputsf = { - {0.00001f, 1024, 1024, 1024 * 1024, 1234ULL}}; +const std::vector> inputsf = {{0.00001f, 1024, 1024, 1024 * 1024, 1234ULL}}; -const std::vector> inputsd = { - {0.00001, 1024, 1024, 1024 * 1024, 1234ULL}}; +const std::vector> inputsd = {{0.00001, 1024, 1024, 1024 * 1024, 1234ULL}}; typedef MathTest MathPowerTestF; -TEST_P(MathPowerTestF, Result) { - ASSERT_TRUE(devArrMatch(in_power.data(), out_power_ref.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MathPowerTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + in_power.data(), out_power_ref.data(), params.len, CompareApprox(params.tolerance))); } typedef MathTest MathPowerTestD; -TEST_P(MathPowerTestD, Result) { - ASSERT_TRUE(devArrMatch(in_power.data(), out_power_ref.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MathPowerTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + in_power.data(), out_power_ref.data(), params.len, CompareApprox(params.tolerance))); } typedef MathTest MathSqrtTestF; -TEST_P(MathSqrtTestF, Result) { - ASSERT_TRUE(devArrMatch(in_sqrt.data(), out_sqrt_ref.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MathSqrtTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + in_sqrt.data(), out_sqrt_ref.data(), params.len, CompareApprox(params.tolerance))); } typedef MathTest MathSqrtTestD; -TEST_P(MathSqrtTestD, Result) { - ASSERT_TRUE(devArrMatch(in_sqrt.data(), out_sqrt_ref.data(), params.len, - CompareApprox(params.tolerance))); +TEST_P(MathSqrtTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + in_sqrt.data(), out_sqrt_ref.data(), params.len, CompareApprox(params.tolerance))); } typedef MathTest MathRatioTestF; -TEST_P(MathRatioTestF, Result) { - ASSERT_TRUE(devArrMatch(in_ratio.data(), out_ratio_ref.data(), 4, - CompareApprox(params.tolerance))); +TEST_P(MathRatioTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(in_ratio.data(), out_ratio_ref.data(), 4, CompareApprox(params.tolerance))); } typedef MathTest MathRatioTestD; -TEST_P(MathRatioTestD, Result) { - ASSERT_TRUE(devArrMatch(in_ratio.data(), out_ratio_ref.data(), 4, - CompareApprox(params.tolerance))); +TEST_P(MathRatioTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(in_ratio.data(), out_ratio_ref.data(), 4, CompareApprox(params.tolerance))); } typedef MathTest MathSignFlipTestF; -TEST_P(MathSignFlipTestF, Result) { - ASSERT_TRUE(devArrMatch(in_sign_flip.data(), out_sign_flip_ref.data(), - params.len, CompareApprox(params.tolerance))); +TEST_P(MathSignFlipTestF, Result) +{ + ASSERT_TRUE(devArrMatch(in_sign_flip.data(), + out_sign_flip_ref.data(), + params.len, + CompareApprox(params.tolerance))); } typedef MathTest MathSignFlipTestD; -TEST_P(MathSignFlipTestD, Result) { - ASSERT_TRUE(devArrMatch(in_sign_flip.data(), out_sign_flip_ref.data(), - params.len, CompareApprox(params.tolerance))); +TEST_P(MathSignFlipTestD, Result) +{ + ASSERT_TRUE(devArrMatch(in_sign_flip.data(), + out_sign_flip_ref.data(), + params.len, + CompareApprox(params.tolerance))); } typedef MathTest MathReciprocalTestF; -TEST_P(MathReciprocalTestF, Result) { - ASSERT_TRUE(devArrMatch(in_recip.data(), in_recip_ref.data(), 4, - CompareApprox(params.tolerance))); +TEST_P(MathReciprocalTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(in_recip.data(), in_recip_ref.data(), 4, CompareApprox(params.tolerance))); // 4-th term tests `setzero=true` functionality, not present in this version of `reciprocal`. - ASSERT_TRUE(devArrMatch(out_recip.data(), in_recip_ref.data(), 3, - CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(out_recip.data(), in_recip_ref.data(), 3, CompareApprox(params.tolerance))); } typedef MathTest MathReciprocalTestD; -TEST_P(MathReciprocalTestD, Result) { - ASSERT_TRUE(devArrMatch(in_recip.data(), in_recip_ref.data(), 4, - CompareApprox(params.tolerance))); +TEST_P(MathReciprocalTestD, Result) +{ + ASSERT_TRUE( + devArrMatch(in_recip.data(), in_recip_ref.data(), 4, CompareApprox(params.tolerance))); // 4-th term tests `setzero=true` functionality, not present in this version of `reciprocal`. - ASSERT_TRUE(devArrMatch(out_recip.data(), in_recip_ref.data(), 3, - CompareApprox(params.tolerance))); + ASSERT_TRUE( + devArrMatch(out_recip.data(), in_recip_ref.data(), 3, CompareApprox(params.tolerance))); } typedef MathTest MathSetSmallZeroTestF; -TEST_P(MathSetSmallZeroTestF, Result) { - ASSERT_TRUE(devArrMatch(in_smallzero.data(), out_smallzero_ref.data(), 4, - CompareApprox(params.tolerance))); +TEST_P(MathSetSmallZeroTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + in_smallzero.data(), out_smallzero_ref.data(), 4, CompareApprox(params.tolerance))); - ASSERT_TRUE(devArrMatch(out_smallzero.data(), out_smallzero_ref.data(), 4, - CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch( + out_smallzero.data(), out_smallzero_ref.data(), 4, CompareApprox(params.tolerance))); } typedef MathTest MathSetSmallZeroTestD; -TEST_P(MathSetSmallZeroTestD, Result) { - ASSERT_TRUE(devArrMatch(in_smallzero.data(), out_smallzero_ref.data(), 4, - CompareApprox(params.tolerance))); +TEST_P(MathSetSmallZeroTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + in_smallzero.data(), out_smallzero_ref.data(), 4, CompareApprox(params.tolerance))); - ASSERT_TRUE(devArrMatch(out_smallzero.data(), out_smallzero_ref.data(), 4, - CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch( + out_smallzero.data(), out_smallzero_ref.data(), 4, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestD, ::testing::ValuesIn(inputsd)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestF, - ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestD, ::testing::ValuesIn(inputsd)); } // namespace matrix } // namespace raft diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu index e247abad1e..6f052f7b46 100644 --- a/cpp/test/matrix/matrix.cu +++ b/cpp/test/matrix/matrix.cu @@ -33,7 +33,8 @@ struct MatrixInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const MatrixInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MatrixInputs& dims) +{ return os; } @@ -45,10 +46,13 @@ class MatrixTest : public ::testing::TestWithParam> { stream(handle.get_stream()), in1(params.n_row * params.n_col, stream), in2(params.n_row * params.n_col, stream), - in1_revr(params.n_row * params.n_col, stream) {} + in1_revr(params.n_row * params.n_col, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int len = params.n_row * params.n_col; r.uniform(in1.data(), len, T(-1.0), T(1.0), stream); @@ -72,87 +76,84 @@ class MatrixTest : public ::testing::TestWithParam> { const std::vector> inputsf2 = {{0.000001f, 4, 4, 1234ULL}}; -const std::vector> inputsd2 = { - {0.00000001, 4, 4, 1234ULL}}; +const std::vector> inputsd2 = {{0.00000001, 4, 4, 1234ULL}}; typedef MatrixTest MatrixTestF; -TEST_P(MatrixTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(in1.data(), in2.data(), +TEST_P(MatrixTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch(in1.data(), + in2.data(), params.n_row * params.n_col, raft::CompareApprox(params.tolerance))); } typedef MatrixTest MatrixTestD; -TEST_P(MatrixTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(in1.data(), in2.data(), +TEST_P(MatrixTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(in1.data(), + in2.data(), params.n_row * params.n_col, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestF, - ::testing::ValuesIn(inputsf2)); +INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestF, ::testing::ValuesIn(inputsf2)); -INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestD, - ::testing::ValuesIn(inputsd2)); +INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestD, ::testing::ValuesIn(inputsd2)); template class MatrixCopyRowsTest : public ::testing::Test { - using math_t = typename std::tuple_element<0, T>::type; - using idx_t = typename std::tuple_element<1, T>::type; + using math_t = typename std::tuple_element<0, T>::type; + using idx_t = typename std::tuple_element<1, T>::type; using idx_array_t = typename std::tuple_element<2, T>::type; protected: MatrixCopyRowsTest() : input(n_cols * n_rows, handle.get_stream()), indices(n_selected, handle.get_stream()), - output(n_cols * n_selected, handle.get_stream()) { + output(n_cols * n_selected, handle.get_stream()) + { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); raft::update_device(indices.data(), indices_host, n_selected, stream); // Init input array thrust::counting_iterator first(0); thrust::device_ptr ptr(input.data()); - thrust::copy(handle.get_thrust_policy(), first, first + n_cols * n_rows, - ptr); + thrust::copy(handle.get_thrust_policy(), first, first + n_cols * n_rows, ptr); } void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); } - void testCopyRows() { - copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(), - n_selected, stream, false); - EXPECT_TRUE(raft::devArrMatchHost(output_exp_colmajor, output.data(), - n_selected * n_cols, - raft::Compare())); - copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(), - n_selected, stream, true); - EXPECT_TRUE(raft::devArrMatchHost(output_exp_rowmajor, output.data(), - n_selected * n_cols, - raft::Compare())); + void testCopyRows() + { + copyRows( + input.data(), n_rows, n_cols, output.data(), indices.data(), n_selected, stream, false); + EXPECT_TRUE(raft::devArrMatchHost( + output_exp_colmajor, output.data(), n_selected * n_cols, raft::Compare())); + copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(), n_selected, stream, true); + EXPECT_TRUE(raft::devArrMatchHost( + output_exp_rowmajor, output.data(), n_selected * n_cols, raft::Compare())); } protected: raft::handle_t handle; cudaStream_t stream; - int n_rows = 10; - int n_cols = 3; + int n_rows = 10; + int n_cols = 3; int n_selected = 5; - idx_array_t indices_host[5] = {0, 3, 4, 7, 9}; - math_t output_exp_colmajor[15] = {0, 3, 4, 7, 9, 10, 13, 14, - 17, 19, 20, 23, 24, 27, 29}; - math_t output_exp_rowmajor[15] = {0, 1, 2, 9, 10, 11, 12, 13, - 14, 21, 22, 23, 27, 28, 29}; + idx_array_t indices_host[5] = {0, 3, 4, 7, 9}; + math_t output_exp_colmajor[15] = {0, 3, 4, 7, 9, 10, 13, 14, 17, 19, 20, 23, 24, 27, 29}; + math_t output_exp_rowmajor[15] = {0, 1, 2, 9, 10, 11, 12, 13, 14, 21, 22, 23, 27, 28, 29}; rmm::device_uvector input; rmm::device_uvector output; rmm::device_uvector indices; }; -using TypeTuple = - ::testing::Types, std::tuple, - std::tuple, - std::tuple>; +using TypeTuple = ::testing::Types, + std::tuple, + std::tuple, + std::tuple>; TYPED_TEST_CASE(MatrixCopyRowsTest, TypeTuple); TYPED_TEST(MatrixCopyRowsTest, CopyRows) { this->testCopyRows(); } diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp index fe42cea8b3..5cfcc910fd 100644 --- a/cpp/test/mr/device/buffer.cpp +++ b/cpp/test/mr/device/buffer.cpp @@ -25,7 +25,8 @@ namespace raft { namespace mr { namespace device { -TEST(Raft, DeviceBufferAlloc) { +TEST(Raft, DeviceBufferAlloc) +{ cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // no allocation at construction @@ -51,13 +52,14 @@ TEST(Raft, DeviceBufferAlloc) { CUDA_CHECK(cudaStreamDestroy(stream)); } -TEST(Raft, DeviceBufferZeroResize) { +TEST(Raft, DeviceBufferZeroResize) +{ // Create a limiting_resource_adaptor to track allocations - auto curr_mr = dynamic_cast( - rmm::mr::get_current_device_resource()); - auto limit_mr = std::make_shared< - rmm::mr::limiting_resource_adaptor>(curr_mr, - 1000); + auto curr_mr = + dynamic_cast(rmm::mr::get_current_device_resource()); + auto limit_mr = + std::make_shared>(curr_mr, + 1000); rmm::mr::set_current_device_resource(limit_mr.get()); diff --git a/cpp/test/mr/host/buffer.cpp b/cpp/test/mr/host/buffer.cpp index 953f65ddfb..aadf05285c 100644 --- a/cpp/test/mr/host/buffer.cpp +++ b/cpp/test/mr/host/buffer.cpp @@ -24,7 +24,8 @@ namespace raft { namespace mr { namespace host { -TEST(Raft, HostBuffer) { +TEST(Raft, HostBuffer) +{ auto alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -51,14 +52,14 @@ TEST(Raft, HostBuffer) { CUDA_CHECK(cudaStreamDestroy(stream)); } -TEST(Raft, DeviceToHostBuffer) { +TEST(Raft, DeviceToHostBuffer) +{ auto d_alloc = std::make_shared(); auto h_alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); device::buffer d_buff(d_alloc, stream, 32); - CUDA_CHECK( - cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream)); + CUDA_CHECK(cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream)); buffer h_buff(h_alloc, d_buff); ASSERT_EQ(d_buff.size(), h_buff.size()); CUDA_CHECK(cudaStreamSynchronize(stream)); diff --git a/cpp/test/mst.cu b/cpp/test/mst.cu index 781e6d1d3f..90a6d7bd87 100644 --- a/cpp/test/mst.cu +++ b/cpp/test/mst.cu @@ -61,7 +61,8 @@ namespace mst { // Sequential prims function // Returns total weight of MST template -weight_t prims(CSRHost &csr_h) { +weight_t prims(CSRHost& csr_h) +{ std::size_t n_vertices = csr_h.offsets.size() - 1; bool active_vertex[n_vertices]; @@ -70,19 +71,18 @@ weight_t prims(CSRHost &csr_h) { for (std::size_t i = 0; i < n_vertices; i++) { active_vertex[i] = false; - curr_edge[i] = static_cast(std::numeric_limits::max()); + curr_edge[i] = static_cast(std::numeric_limits::max()); } curr_edge[0] = 0; // function to pick next min vertex-edge - auto min_vertex_edge = [](auto *curr_edge, auto *active_vertex, - auto n_vertices) { + auto min_vertex_edge = [](auto* curr_edge, auto* active_vertex, auto n_vertices) { auto min = static_cast(std::numeric_limits::max()); vertex_t min_vertex{}; for (std::size_t v = 0; v < n_vertices; v++) { if (!active_vertex[v] && curr_edge[v] < min) { - min = curr_edge[v]; + min = curr_edge[v]; min_vertex = v; } } @@ -98,14 +98,13 @@ weight_t prims(CSRHost &csr_h) { active_vertex[curr_v] = true; // set to active // iterate through edges of current active vertex - auto edge_st = csr_h.offsets[curr_v]; + auto edge_st = csr_h.offsets[curr_v]; auto edge_end = csr_h.offsets[curr_v + 1]; for (auto e = edge_st; e < edge_end; e++) { // put edges to be considered for next iteration auto neighbor_idx = csr_h.indices[e]; - if (!active_vertex[neighbor_idx] && - csr_h.weights[e] < curr_edge[neighbor_idx]) { + if (!active_vertex[neighbor_idx] && csr_h.weights[e] < curr_edge[neighbor_idx]) { curr_edge[neighbor_idx] = csr_h.weights[e]; } } @@ -121,15 +120,15 @@ weight_t prims(CSRHost &csr_h) { } template -class MSTTest - : public ::testing::TestWithParam> { +class MSTTest : public ::testing::TestWithParam> { protected: std::pair, raft::Graph_COO> - mst_gpu() { - edge_t *offsets = static_cast(csr_d.offsets.data()); - vertex_t *indices = static_cast(csr_d.indices.data()); - weight_t *weights = static_cast(csr_d.weights.data()); + mst_gpu() + { + edge_t* offsets = static_cast(csr_d.offsets.data()); + vertex_t* indices = static_cast(csr_d.indices.data()); + weight_t* weights = static_cast(csr_d.weights.data()); v = static_cast((csr_d.offsets.size() / sizeof(vertex_t)) - 1); e = static_cast(csr_d.indices.size() / sizeof(edge_t)); @@ -138,89 +137,95 @@ class MSTTest rmm::device_uvector mst_dst(2 * v - 2, handle.get_stream()); rmm::device_uvector color(v, handle.get_stream()); - CUDA_CHECK( - cudaMemsetAsync(mst_src.data(), std::numeric_limits::max(), - mst_src.size() * sizeof(vertex_t), handle.get_stream())); - CUDA_CHECK( - cudaMemsetAsync(mst_dst.data(), std::numeric_limits::max(), - mst_dst.size() * sizeof(vertex_t), handle.get_stream())); - CUDA_CHECK(cudaMemsetAsync(color.data(), 0, color.size() * sizeof(vertex_t), + CUDA_CHECK(cudaMemsetAsync(mst_src.data(), + std::numeric_limits::max(), + mst_src.size() * sizeof(vertex_t), + handle.get_stream())); + CUDA_CHECK(cudaMemsetAsync(mst_dst.data(), + std::numeric_limits::max(), + mst_dst.size() * sizeof(vertex_t), handle.get_stream())); + CUDA_CHECK( + cudaMemsetAsync(color.data(), 0, color.size() * sizeof(vertex_t), handle.get_stream())); - vertex_t *color_ptr = thrust::raw_pointer_cast(color.data()); + vertex_t* color_ptr = thrust::raw_pointer_cast(color.data()); if (iterations == 0) { MST_solver symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - true, true, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), true, true, 0); auto symmetric_result = symmetric_solver.solve(); MST_solver non_symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - false, true, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), false, true, 0); auto non_symmetric_result = non_symmetric_solver.solve(); EXPECT_LE(symmetric_result.n_edges, 2 * v - 2); EXPECT_LE(non_symmetric_result.n_edges, v - 1); - return std::make_pair(std::move(symmetric_result), - std::move(non_symmetric_result)); + return std::make_pair(std::move(symmetric_result), std::move(non_symmetric_result)); } else { - MST_solver intermediate_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - true, true, iterations); + MST_solver intermediate_solver(handle, + offsets, + indices, + weights, + v, + e, + color_ptr, + handle.get_stream(), + true, + true, + iterations); auto intermediate_result = intermediate_solver.solve(); MST_solver symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - true, false, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), true, false, 0); auto symmetric_result = symmetric_solver.solve(); // symmetric_result.n_edges += intermediate_result.n_edges; - auto total_edge_size = - symmetric_result.n_edges + intermediate_result.n_edges; + auto total_edge_size = symmetric_result.n_edges + intermediate_result.n_edges; symmetric_result.src.resize(total_edge_size, handle.get_stream()); symmetric_result.dst.resize(total_edge_size, handle.get_stream()); symmetric_result.weights.resize(total_edge_size, handle.get_stream()); raft::copy(symmetric_result.src.data() + symmetric_result.n_edges, - intermediate_result.src.data(), intermediate_result.n_edges, + intermediate_result.src.data(), + intermediate_result.n_edges, handle.get_stream()); raft::copy(symmetric_result.dst.data() + symmetric_result.n_edges, - intermediate_result.dst.data(), intermediate_result.n_edges, + intermediate_result.dst.data(), + intermediate_result.n_edges, handle.get_stream()); raft::copy(symmetric_result.weights.data() + symmetric_result.n_edges, intermediate_result.weights.data(), - intermediate_result.n_edges, handle.get_stream()); + intermediate_result.n_edges, + handle.get_stream()); symmetric_result.n_edges = total_edge_size; MST_solver non_symmetric_solver( - handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), - false, true, 0); + handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), false, true, 0); auto non_symmetric_result = non_symmetric_solver.solve(); EXPECT_LE(symmetric_result.n_edges, 2 * v - 2); EXPECT_LE(non_symmetric_result.n_edges, v - 1); - return std::make_pair(std::move(symmetric_result), - std::move(non_symmetric_result)); + return std::make_pair(std::move(symmetric_result), std::move(non_symmetric_result)); } } - void SetUp() override { - mst_input = ::testing::TestWithParam< - MSTTestInput>::GetParam(); + void SetUp() override + { + mst_input = ::testing::TestWithParam>::GetParam(); iterations = mst_input.iterations; - csr_d.offsets = rmm::device_buffer( - mst_input.csr_h.offsets.data(), - mst_input.csr_h.offsets.size() * sizeof(edge_t), handle.get_stream()); - csr_d.indices = rmm::device_buffer( - mst_input.csr_h.indices.data(), - mst_input.csr_h.indices.size() * sizeof(vertex_t), handle.get_stream()); - csr_d.weights = rmm::device_buffer( - mst_input.csr_h.weights.data(), - mst_input.csr_h.weights.size() * sizeof(weight_t), handle.get_stream()); + csr_d.offsets = rmm::device_buffer(mst_input.csr_h.offsets.data(), + mst_input.csr_h.offsets.size() * sizeof(edge_t), + handle.get_stream()); + csr_d.indices = rmm::device_buffer(mst_input.csr_h.indices.data(), + mst_input.csr_h.indices.size() * sizeof(vertex_t), + handle.get_stream()); + csr_d.weights = rmm::device_buffer(mst_input.csr_h.weights.data(), + mst_input.csr_h.weights.size() * sizeof(weight_t), + handle.get_stream()); } void TearDown() override {} @@ -272,41 +277,68 @@ const std::vector> csr_in_h = { const std::vector> csr_in4_h = { {{0, 3, 5, 8, 10, 12, 14, 16}, {2, 4, 5, 3, 6, 0, 4, 5, 1, 6, 0, 2, 0, 2, 1, 3}, - {5.0f, 9.0f, 1.0f, 8.0f, 7.0f, 5.0f, 2.0f, 6.0f, 8.0f, 10.0f, 9.0f, 2.0f, - 1.0f, 6.0f, 7.0f, 10.0f}}}; + {5.0f, + 9.0f, + 1.0f, + 8.0f, + 7.0f, + 5.0f, + 2.0f, + 6.0f, + 8.0f, + 10.0f, + 9.0f, + 2.0f, + 1.0f, + 6.0f, + 7.0f, + 10.0f}}}; // singletons const std::vector> csr_in5_h = { {{0, 3, 5, 8, 10, 10, 10, 12, 14, 16, 16}, {2, 8, 7, 3, 8, 0, 8, 7, 1, 8, 0, 2, 0, 2, 1, 3}, - {5.0f, 9.0f, 1.0f, 8.0f, 7.0f, 5.0f, 2.0f, 6.0f, 8.0f, 10.0f, 9.0f, 2.0f, - 1.0f, 6.0f, 7.0f, 10.0f}}}; + {5.0f, + 9.0f, + 1.0f, + 8.0f, + 7.0f, + 5.0f, + 2.0f, + 6.0f, + 8.0f, + 10.0f, + 9.0f, + 2.0f, + 1.0f, + 6.0f, + 7.0f, + 10.0f}}}; typedef MSTTest MSTTestSequential; -TEST_P(MSTTestSequential, Sequential) { - auto results_pair = mst_gpu(); - auto &symmetric_result = results_pair.first; - auto &non_symmetric_result = results_pair.second; +TEST_P(MSTTestSequential, Sequential) +{ + auto results_pair = mst_gpu(); + auto& symmetric_result = results_pair.first; + auto& non_symmetric_result = results_pair.second; // do assertions here // in this case, running sequential MST auto prims_result = prims(mst_input.csr_h); - auto symmetric_sum = - thrust::reduce(thrust::device, symmetric_result.weights.data(), - symmetric_result.weights.data() + symmetric_result.n_edges); - auto non_symmetric_sum = thrust::reduce( - thrust::device, non_symmetric_result.weights.data(), - non_symmetric_result.weights.data() + non_symmetric_result.n_edges); - - ASSERT_TRUE(raft::match(2 * prims_result, symmetric_sum, - raft::CompareApprox(0.1))); - ASSERT_TRUE(raft::match(prims_result, non_symmetric_sum, - raft::CompareApprox(0.1))); + auto symmetric_sum = thrust::reduce(thrust::device, + symmetric_result.weights.data(), + symmetric_result.weights.data() + symmetric_result.n_edges); + auto non_symmetric_sum = + thrust::reduce(thrust::device, + non_symmetric_result.weights.data(), + non_symmetric_result.weights.data() + non_symmetric_result.n_edges); + + ASSERT_TRUE(raft::match(2 * prims_result, symmetric_sum, raft::CompareApprox(0.1))); + ASSERT_TRUE(raft::match(prims_result, non_symmetric_sum, raft::CompareApprox(0.1))); } -INSTANTIATE_TEST_SUITE_P(MSTTests, MSTTestSequential, - ::testing::ValuesIn(csr_in_h)); +INSTANTIATE_TEST_SUITE_P(MSTTests, MSTTestSequential, ::testing::ValuesIn(csr_in_h)); } // namespace mst } // namespace raft diff --git a/cpp/test/pow2_utils.cu b/cpp/test/pow2_utils.cu index 92976e5c61..c76064ade7 100644 --- a/cpp/test/pow2_utils.cu +++ b/cpp/test/pow2_utils.cu @@ -24,7 +24,8 @@ struct Pow2Test : public ::testing::Test { typedef Pow2 P; std::vector data; - void SetUp() override { + void SetUp() override + { std::vector pos = {0, 1, 2, 7, 15, 16, 17, 31, 35, 1024, 1623}; data.insert(data.end(), pos.begin(), pos.end()); if constexpr (std::is_signed::value) { @@ -35,7 +36,8 @@ struct Pow2Test : public ::testing::Test { data.push_back(std::numeric_limits::max()); } - void quotRem() { + void quotRem() + { for (auto x : data) { ASSERT_EQ(P::quot(x), x / P::Value) << " where x = " << x; ASSERT_EQ(P::rem(x), x % P::Value) << " where x = " << x; @@ -43,31 +45,32 @@ struct Pow2Test : public ::testing::Test { } } - void divMod() { + void divMod() + { for (auto x : data) { ASSERT_GE(P::mod(x), 0) << " where x = " << x; ASSERT_EQ(x, P::div(x) * P::Value + P::mod(x)); } } - void round() { + void round() + { for (auto x : data) { - if (x <= std::numeric_limits::max() - TargetT(P::Value)) - ASSERT_GE(P::roundUp(x), x); + if (x <= std::numeric_limits::max() - TargetT(P::Value)) ASSERT_GE(P::roundUp(x), x); if (x >= std::numeric_limits::min() + TargetT(P::Value)) ASSERT_LE(P::roundDown(x), x); ASSERT_EQ(x - P::roundDown(x), P::mod(x)) << " where x = " << x; - ASSERT_EQ(P::mod(P::roundUp(x) + P::mod(x) - x), 0) - << " where x = " << x; + ASSERT_EQ(P::mod(P::roundUp(x) + P::mod(x) - x), 0) << " where x = " << x; } } - void alignment() { + void alignment() + { for (auto x : data) { ASSERT_TRUE(P::areSameAlignOffsets(x, x)); if (x <= std::numeric_limits::max() - TargetT(P::Value)) { ASSERT_TRUE(P::areSameAlignOffsets(x, x + TargetT(P::Value))); - int aligned_count = 0; + int aligned_count = 0; int same_aligned_count = 0; for (int i = 0; i < int(P::Value); i++) { aligned_count += P::isAligned(x + i); @@ -97,10 +100,11 @@ TEST_IT(Pow2_u64_i32_128); TEST_IT(Pow2_ll_u16_32); TEST_IT(Pow2_i32_u64_16); -TEST(Pow2, pointers) { +TEST(Pow2, pointers) +{ typedef Pow2<32UL> P; for (ptrdiff_t i = 0; i <= ptrdiff_t(P::Value); i++) { - auto *p = reinterpret_cast(16345 + i); + auto* p = reinterpret_cast(16345 + i); ASSERT_GE(P::roundUp(p), p); ASSERT_LE(P::roundDown(p), p); } diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu index 810d6cb871..69dc146486 100644 --- a/cpp/test/random/rng.cu +++ b/cpp/test/random/rng.cu @@ -40,12 +40,13 @@ enum RandomType { }; template -__global__ void meanKernel(T* out, const T* data, int len) { +__global__ void meanKernel(T* out, const T* data, int len) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; int tid = threadIdx.x + blockIdx.x * blockDim.x; - T val = tid < len ? data[tid] : T(0); - T x = BlockReduce(temp_storage).Sum(val); + T val = tid < len ? data[tid] : T(0); + T x = BlockReduce(temp_storage).Sum(val); __syncthreads(); T xx = BlockReduce(temp_storage).Sum(val * val); __syncthreads(); @@ -72,7 +73,8 @@ struct RngInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const RngInputs& dims) { +::std::ostream& operator<<(::std::ostream& os, const RngInputs& dims) +{ return os; } @@ -86,47 +88,36 @@ class RngTest : public ::testing::TestWithParam> { : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), data(0, stream), - stats(2, stream) { + stats(2, stream) + { data.resize(params.len, stream); CUDA_CHECK(cudaMemsetAsync(stats.data(), 0, 2 * sizeof(T), stream)); } protected: - void SetUp() override { + void SetUp() override + { // Tests are configured with their expected test-values sigma. For example, // 4 x sigma indicates the test shouldn't fail 99.9% of the time. num_sigma = 10; Rng r(params.seed, params.gtype); switch (params.type) { - case RNG_Normal: - r.normal(data.data(), params.len, params.start, params.end, stream); - break; + case RNG_Normal: r.normal(data.data(), params.len, params.start, params.end, stream); break; case RNG_LogNormal: r.lognormal(data.data(), params.len, params.start, params.end, stream); break; - case RNG_Uniform: - r.uniform(data.data(), params.len, params.start, params.end, stream); - break; - case RNG_Gumbel: - r.gumbel(data.data(), params.len, params.start, params.end, stream); - break; + case RNG_Uniform: r.uniform(data.data(), params.len, params.start, params.end, stream); break; + case RNG_Gumbel: r.gumbel(data.data(), params.len, params.start, params.end, stream); break; case RNG_Logistic: r.logistic(data.data(), params.len, params.start, params.end, stream); break; - case RNG_Exp: - r.exponential(data.data(), params.len, params.start, stream); - break; - case RNG_Rayleigh: - r.rayleigh(data.data(), params.len, params.start, stream); - break; - case RNG_Laplace: - r.laplace(data.data(), params.len, params.start, params.end, stream); - break; + case RNG_Exp: r.exponential(data.data(), params.len, params.start, stream); break; + case RNG_Rayleigh: r.rayleigh(data.data(), params.len, params.start, stream); break; + case RNG_Laplace: r.laplace(data.data(), params.len, params.start, params.end, stream); break; }; static const int threads = 128; - meanKernel - <<>>( - stats.data(), data.data(), params.len); + meanKernel<<>>( + stats.data(), data.data(), params.len); update_host(h_stats, stats.data(), 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; @@ -134,18 +125,18 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(stream)); } - void getExpectedMeanVar(T meanvar[2]) { + void getExpectedMeanVar(T meanvar[2]) + { switch (params.type) { case RNG_Normal: meanvar[0] = params.start; meanvar[1] = params.end * params.end; break; case RNG_LogNormal: { - auto var = params.end * params.end; - auto mu = params.start; + auto var = params.end * params.end; + auto mu = params.start; meanvar[0] = raft::myExp(mu + var * T(0.5)); - meanvar[1] = - (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var); + meanvar[1] = (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var); break; } case RNG_Uniform: @@ -169,8 +160,7 @@ class RngTest : public ::testing::TestWithParam> { break; case RNG_Rayleigh: meanvar[0] = params.start * raft::mySqrt(T(3.1415 / 2.0)); - meanvar[1] = - ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start; + meanvar[1] = ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start; break; case RNG_Laplace: meanvar[0] = params.start; @@ -264,13 +254,12 @@ const std::vector> inputsf = { {0.02, 32 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL}, {0.04, 8 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL}}; -TEST_P(RngTestF, Result) { +TEST_P(RngTestF, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], - CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], - CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestF, ::testing::ValuesIn(inputsf)); @@ -326,13 +315,12 @@ const std::vector> inputsd = { {0.025, 8 * 1024, 1.0, 1.0, RNG_Rayleigh, GenKiss99, 1234ULL}, {0.02, 32 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL}, {0.04, 8 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL}}; -TEST_P(RngTestD, Result) { +TEST_P(RngTestD, Result) +{ double meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], - CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], - CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd)); @@ -340,7 +328,8 @@ INSTANTIATE_TEST_SUITE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd)); // Test for expected variance in mean calculations template -T quick_mean(const std::vector& d) { +T quick_mean(const std::vector& d) +{ T acc = T(0); for (const auto& di : d) { acc += di; @@ -349,8 +338,9 @@ T quick_mean(const std::vector& d) { } template -T quick_std(const std::vector& d) { - T acc = T(0); +T quick_std(const std::vector& d) +{ + T acc = T(0); T d_mean = quick_mean(d); for (const auto& di : d) { acc += ((di - d_mean) * (di - d_mean)); @@ -359,7 +349,8 @@ T quick_std(const std::vector& d) { } template -std::ostream& operator<<(std::ostream& out, const std::vector& v) { +std::ostream& operator<<(std::ostream& out, const std::vector& v) +{ if (!v.empty()) { out << '['; std::copy(v.begin(), v.end(), std::ostream_iterator(out, ", ")); @@ -374,13 +365,14 @@ std::ostream& operator<<(std::ostream& out, const std::vector& v) { // experiments computing the mean, giving us a distribution of the mean // itself. The mean error is simply the standard deviation of this // distribution (the standard deviation of the mean). -TEST(Rng, MeanError) { +TEST(Rng, MeanError) +{ timeb time_struct; ftime(&time_struct); - int seed = time_struct.millitm; - int num_samples = 1024; + int seed = time_struct.millitm; + int num_samples = 1024; int num_experiments = 1024; - int len = num_samples * num_experiments; + int len = num_samples * num_experiments; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); @@ -393,22 +385,26 @@ TEST(Rng, MeanError) { Rng r(seed, rtype); r.normal(data.data(), len, 3.3f, 0.23f, stream); // r.uniform(data, len, -1.0, 2.0); - raft::stats::mean(mean_result.data(), data.data(), num_samples, - num_experiments, false, false, stream); - raft::stats::stddev(std_result.data(), data.data(), mean_result.data(), - num_samples, num_experiments, false, false, stream); + raft::stats::mean( + mean_result.data(), data.data(), num_samples, num_experiments, false, false, stream); + raft::stats::stddev(std_result.data(), + data.data(), + mean_result.data(), + num_samples, + num_experiments, + false, + false, + stream); std::vector h_mean_result(num_experiments); std::vector h_std_result(num_experiments); - update_host(h_mean_result.data(), mean_result.data(), num_experiments, - stream); - update_host(h_std_result.data(), std_result.data(), num_experiments, - stream); + update_host(h_mean_result.data(), mean_result.data(), num_experiments, stream); + update_host(h_std_result.data(), std_result.data(), num_experiments, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); auto d_mean = quick_mean(h_mean_result); // std-dev of mean; also known as mean error - auto d_std_of_mean = quick_std(h_mean_result); - auto d_std = quick_mean(h_std_result); + auto d_std_of_mean = quick_std(h_mean_result); + auto d_std = quick_mean(h_std_result); auto d_std_of_mean_analytical = d_std / std::sqrt(num_samples); // std::cout << "measured mean error: " << d_std_of_mean << "\n"; @@ -417,8 +413,7 @@ TEST(Rng, MeanError) { auto diff_expected_vs_measured_mean_error = std::abs(d_std_of_mean - d_std / std::sqrt(num_samples)); - ASSERT_TRUE( - (diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5)); + ASSERT_TRUE((diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5)); } CUDA_CHECK(cudaStreamDestroy(stream)); @@ -431,18 +426,19 @@ class ScaledBernoulliTest : public ::testing::Test { ScaledBernoulliTest() : stream(handle.get_stream()), data(len, stream) {} protected: - void SetUp() override { + void SetUp() override + { CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(42); r.scaled_bernoulli(data.data(), len, T(0.5), T(scale), stream); } - void rangeCheck() { + void rangeCheck() + { T* h_data = new T[len]; update_host(h_data, data.data(), len, stream); - ASSERT_TRUE(std::none_of(h_data, h_data + len, [](const T& a) { - return a < -scale || a > scale; - })); + ASSERT_TRUE( + std::none_of(h_data, h_data + len, [](const T& a) { return a < -scale || a > scale; })); delete[] h_data; } @@ -464,13 +460,15 @@ class BernoulliTest : public ::testing::Test { BernoulliTest() : stream(handle.get_stream()), data(len, stream) {} protected: - void SetUp() override { + void SetUp() override + { Rng r(42); r.bernoulli(data.data(), len, T(0.5), stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } - void trueFalseCheck() { + void trueFalseCheck() + { // both true and false values must be present bool* h_data = new bool[len]; update_host(h_data, data.data(), len, stream); @@ -502,38 +500,39 @@ struct RngNormalTableInputs { }; template -::std::ostream& operator<<(::std::ostream& os, - const RngNormalTableInputs& dims) { +::std::ostream& operator<<(::std::ostream& os, const RngNormalTableInputs& dims) +{ return os; } template -class RngNormalTableTest - : public ::testing::TestWithParam> { +class RngNormalTableTest : public ::testing::TestWithParam> { public: RngNormalTableTest() : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), data(params.rows * params.cols, stream), stats(2, stream), - mu_vec(params.cols, stream) { + mu_vec(params.cols, stream) + { CUDA_CHECK(cudaMemsetAsync(stats.data(), 0, 2 * sizeof(T), stream)); } protected: - void SetUp() override { + void SetUp() override + { // Tests are configured with their expected test-values sigma. For example, // 4 x sigma indicates the test shouldn't fail 99.9% of the time. num_sigma = 10; - int len = params.rows * params.cols; + int len = params.rows * params.cols; Rng r(params.seed, params.gtype); r.fill(mu_vec.data(), params.cols, params.mu, stream); T* sigma_vec = nullptr; - r.normalTable(data.data(), params.rows, params.cols, mu_vec.data(), - sigma_vec, params.sigma, stream); + r.normalTable( + data.data(), params.rows, params.cols, mu_vec.data(), sigma_vec, params.sigma, stream); static const int threads = 128; - meanKernel<<>>( - stats.data(), data.data(), len); + meanKernel + <<>>(stats.data(), data.data(), len); update_host(h_stats, stats.data(), 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= len; @@ -541,7 +540,8 @@ class RngNormalTableTest CUDA_CHECK(cudaStreamSynchronize(stream)); } - void getExpectedMeanVar(T meanvar[2]) { + void getExpectedMeanVar(T meanvar[2]) + { meanvar[0] = params.mu; meanvar[1] = params.sigma * params.sigma; } @@ -565,16 +565,14 @@ const std::vector> inputsf_t = { {0.0055, 32, 1024, 1.f, 1.f, GenKiss99, 1234ULL}, {0.011, 8, 1024, 1.f, 1.f, GenKiss99, 1234ULL}}; -TEST_P(RngNormalTableTestF, Result) { +TEST_P(RngNormalTableTestF, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], - CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], - CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestF, - ::testing::ValuesIn(inputsf_t)); +INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestF, ::testing::ValuesIn(inputsf_t)); typedef RngNormalTableTest RngNormalTableTestD; const std::vector> inputsd_t = { @@ -584,16 +582,14 @@ const std::vector> inputsd_t = { {0.011, 8, 1024, 1.0, 1.0, GenTaps, 1234ULL}, {0.0055, 32, 1024, 1.0, 1.0, GenKiss99, 1234ULL}, {0.011, 8, 1024, 1.0, 1.0, GenKiss99, 1234ULL}}; -TEST_P(RngNormalTableTestD, Result) { +TEST_P(RngNormalTableTestD, Result) +{ double meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE(match(meanvar[0], h_stats[0], - CompareApprox(num_sigma * params.tolerance))); - ASSERT_TRUE(match(meanvar[1], h_stats[1], - CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(num_sigma * params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(num_sigma * params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestD, - ::testing::ValuesIn(inputsd_t)); +INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestD, ::testing::ValuesIn(inputsd_t)); struct RngAffineInputs { int n; @@ -602,13 +598,15 @@ struct RngAffineInputs { class RngAffineTest : public ::testing::TestWithParam { protected: - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam::GetParam(); Rng r(params.seed); r.affine_transform_params(params.n, a, b); } - void check() { + void check() + { ASSERT_TRUE(gcd(a, params.n) == 1); ASSERT_TRUE(0 <= b && b < params.n); } @@ -619,13 +617,17 @@ class RngAffineTest : public ::testing::TestWithParam { }; // RngAffineTest const std::vector inputs_affine = { - {100, 123456ULL}, {100, 1234567890ULL}, {101, 123456ULL}, - {101, 1234567890ULL}, {7, 123456ULL}, {7, 1234567890ULL}, - {2568, 123456ULL}, {2568, 1234567890ULL}, + {100, 123456ULL}, + {100, 1234567890ULL}, + {101, 123456ULL}, + {101, 1234567890ULL}, + {7, 123456ULL}, + {7, 1234567890ULL}, + {2568, 123456ULL}, + {2568, 1234567890ULL}, }; TEST_P(RngAffineTest, Result) { check(); } -INSTANTIATE_TEST_SUITE_P(RngAffineTests, RngAffineTest, - ::testing::ValuesIn(inputs_affine)); +INSTANTIATE_TEST_SUITE_P(RngAffineTests, RngAffineTest, ::testing::ValuesIn(inputs_affine)); } // namespace random } // namespace raft diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu index cef2d47276..f0331b7746 100644 --- a/cpp/test/random/rng_int.cu +++ b/cpp/test/random/rng_int.cu @@ -29,12 +29,13 @@ using namespace raft::random::detail; enum RandomType { RNG_Uniform }; template -__global__ void meanKernel(float *out, const T *data, int len) { +__global__ void meanKernel(float* out, const T* data, int len) +{ typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; - int tid = threadIdx.x + blockIdx.x * blockDim.x; + int tid = threadIdx.x + blockIdx.x * blockDim.x; float val = tid < len ? data[tid] : T(0); - float x = BlockReduce(temp_storage).Sum(val); + float x = BlockReduce(temp_storage).Sum(val); __syncthreads(); float xx = BlockReduce(temp_storage).Sum(val * val); __syncthreads(); @@ -61,7 +62,8 @@ struct RngInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const RngInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const RngInputs& dims) +{ return os; } @@ -72,13 +74,15 @@ class RngTest : public ::testing::TestWithParam> { : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), data(0, stream), - stats(2, stream) { + stats(2, stream) + { data.resize(params.len, stream); CUDA_CHECK(cudaMemsetAsync(stats.data(), 0, 2 * sizeof(float), stream)); } protected: - void SetUp() override { + void SetUp() override + { Rng r(params.seed, params.gtype); switch (params.type) { @@ -87,9 +91,8 @@ class RngTest : public ::testing::TestWithParam> { break; }; static const int threads = 128; - meanKernel - <<>>( - stats.data(), data.data(), params.len); + meanKernel<<>>( + stats.data(), data.data(), params.len); update_host(h_stats, stats.data(), 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; @@ -97,7 +100,8 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(stream)); } - void getExpectedMeanVar(float meanvar[2]) { + void getExpectedMeanVar(float meanvar[2]) + { switch (params.type) { case RNG_Uniform: meanvar[0] = (params.start + params.end) * 0.5f; @@ -125,13 +129,12 @@ const std::vector> inputs_u32 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestU32, Result) { +TEST_P(RngTestU32, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE( - match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE( - match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU32, ::testing::ValuesIn(inputs_u32)); @@ -143,13 +146,12 @@ const std::vector> inputs_u64 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestU64, Result) { +TEST_P(RngTestU64, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE( - match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE( - match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU64, ::testing::ValuesIn(inputs_u64)); @@ -161,13 +163,12 @@ const std::vector> inputs_s32 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestS32, Result) { +TEST_P(RngTestS32, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE( - match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE( - match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS32, ::testing::ValuesIn(inputs_s32)); @@ -179,13 +180,12 @@ const std::vector> inputs_s64 = { {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL}, {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}, {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}}; -TEST_P(RngTestS64, Result) { +TEST_P(RngTestS64, Result) +{ float meanvar[2]; getExpectedMeanVar(meanvar); - ASSERT_TRUE( - match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); - ASSERT_TRUE( - match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox(params.tolerance))); + ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS64, ::testing::ValuesIn(inputs_s64)); diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu index 1d33f08c62..a681bbb07d 100644 --- a/cpp/test/random/sample_without_replacement.cu +++ b/cpp/test/random/sample_without_replacement.cu @@ -40,7 +40,8 @@ struct SWoRInputs { }; template -::std::ostream& operator<<(::std::ostream& os, const SWoRInputs& dims) { +::std::ostream& operator<<(::std::ostream& os, const SWoRInputs& dims) +{ return os; } @@ -53,20 +54,27 @@ class SWoRTest : public ::testing::TestWithParam> { in(params.len, stream), wts(params.len, stream), out(params.sampledLen, stream), - outIdx(params.sampledLen, stream) {} + outIdx(params.sampledLen, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { Rng r(params.seed, params.gtype); h_outIdx.resize(params.sampledLen); r.uniform(in.data(), params.len, T(-1.0), T(1.0), stream); r.uniform(wts.data(), params.len, T(1.0), T(2.0), stream); if (params.largeWeightIndex >= 0) { - update_device(wts.data() + params.largeWeightIndex, ¶ms.largeWeight, - 1, stream); + update_device(wts.data() + params.largeWeightIndex, ¶ms.largeWeight, 1, stream); } - r.sampleWithoutReplacement(handle, out.data(), outIdx.data(), in.data(), - wts.data(), params.sampledLen, params.len, + r.sampleWithoutReplacement(handle, + out.data(), + outIdx.data(), + in.data(), + wts.data(), + params.sampledLen, + params.len, stream); update_host(&(h_outIdx[0]), outIdx.data(), params.sampledLen, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -145,14 +153,14 @@ const std::vector> inputsf = { {1024, 512, 10, 100000.f, GenKiss99, 1234ULL}, }; -TEST_P(SWoRTestF, Result) { +TEST_P(SWoRTestF, Result) +{ std::set occurence; for (int i = 0; i < params.sampledLen; ++i) { auto val = h_outIdx[i]; // indices must be in the given range ASSERT_TRUE(0 <= val && val < params.len) - << "out-of-range index @i=" << i << " val=" << val - << " sampledLen=" << params.sampledLen; + << "out-of-range index @i=" << i << " val=" << val << " sampledLen=" << params.sampledLen; // indices should not repeat ASSERT_TRUE(occurence.find(val) == occurence.end()) << "repeated index @i=" << i << " idx=" << val; @@ -160,9 +168,7 @@ TEST_P(SWoRTestF, Result) { } // if there's a skewed distribution, the top index should correspond to the // particular item with a large weight - if (params.largeWeightIndex >= 0) { - ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); - } + if (params.largeWeightIndex >= 0) { ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); } } INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestF, ::testing::ValuesIn(inputsf)); @@ -229,14 +235,14 @@ const std::vector> inputsd = { {1024, 512, 10, 100000.0, GenKiss99, 1234ULL}, }; -TEST_P(SWoRTestD, Result) { +TEST_P(SWoRTestD, Result) +{ std::set occurence; for (int i = 0; i < params.sampledLen; ++i) { auto val = h_outIdx[i]; // indices must be in the given range ASSERT_TRUE(0 <= val && val < params.len) - << "out-of-range index @i=" << i << " val=" << val - << " sampledLen=" << params.sampledLen; + << "out-of-range index @i=" << i << " val=" << val << " sampledLen=" << params.sampledLen; // indices should not repeat ASSERT_TRUE(occurence.find(val) == occurence.end()) << "repeated index @i=" << i << " idx=" << val; @@ -244,9 +250,7 @@ TEST_P(SWoRTestD, Result) { } // if there's a skewed distribution, the top index should correspond to the // particular item with a large weight - if (params.largeWeightIndex >= 0) { - ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); - } + if (params.largeWeightIndex >= 0) { ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); } } INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestD, ::testing::ValuesIn(inputsd)); diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu index a5f08489f1..d7e11e8fef 100644 --- a/cpp/test/sparse/add.cu +++ b/cpp/test/sparse/add.cu @@ -44,12 +44,10 @@ struct CSRAddInputs { }; template -class CSRAddTest - : public ::testing::TestWithParam> { +class CSRAddTest : public ::testing::TestWithParam> { public: CSRAddTest() - : params( - ::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), ind_a(params.matrix_a.row_ind.size(), stream), ind_ptr_a(params.matrix_a.row_ind_ptr.size(), stream), @@ -62,59 +60,69 @@ class CSRAddTest values_verify(params.matrix_verify.row_ind_ptr.size(), stream), ind_result(params.matrix_a.row_ind.size(), stream), ind_ptr_result(params.matrix_verify.row_ind_ptr.size(), stream), - values_result(params.matrix_verify.row_ind_ptr.size(), stream) {} + values_result(params.matrix_verify.row_ind_ptr.size(), stream) + { + } protected: - void SetUp() override { - n_rows = params.matrix_a.row_ind.size(); - nnz_a = params.matrix_a.row_ind_ptr.size(); - nnz_b = params.matrix_b.row_ind_ptr.size(); + void SetUp() override + { + n_rows = params.matrix_a.row_ind.size(); + nnz_a = params.matrix_a.row_ind_ptr.size(); + nnz_b = params.matrix_b.row_ind_ptr.size(); nnz_result = params.matrix_verify.row_ind_ptr.size(); } - void Run() { - raft::update_device(ind_a.data(), params.matrix_a.row_ind.data(), n_rows, - stream); - raft::update_device(ind_ptr_a.data(), params.matrix_a.row_ind_ptr.data(), - nnz_a, stream); - raft::update_device(values_a.data(), params.matrix_a.values.data(), nnz_a, - stream); - - raft::update_device(ind_b.data(), params.matrix_b.row_ind.data(), n_rows, - stream); - raft::update_device(ind_ptr_b.data(), params.matrix_b.row_ind_ptr.data(), - nnz_b, stream); - raft::update_device(values_b.data(), params.matrix_b.values.data(), nnz_b, - stream); - - raft::update_device(ind_verify.data(), params.matrix_verify.row_ind.data(), - n_rows, stream); - raft::update_device(ind_ptr_verify.data(), - params.matrix_verify.row_ind_ptr.data(), nnz_result, - stream); - raft::update_device(values_verify.data(), - params.matrix_verify.values.data(), nnz_result, stream); - - Index_ nnz = linalg::csr_add_calc_inds( - ind_a.data(), ind_ptr_a.data(), values_a.data(), nnz_a, ind_b.data(), - ind_ptr_b.data(), values_b.data(), nnz_b, n_rows, ind_result.data(), - stream); + void Run() + { + raft::update_device(ind_a.data(), params.matrix_a.row_ind.data(), n_rows, stream); + raft::update_device(ind_ptr_a.data(), params.matrix_a.row_ind_ptr.data(), nnz_a, stream); + raft::update_device(values_a.data(), params.matrix_a.values.data(), nnz_a, stream); + + raft::update_device(ind_b.data(), params.matrix_b.row_ind.data(), n_rows, stream); + raft::update_device(ind_ptr_b.data(), params.matrix_b.row_ind_ptr.data(), nnz_b, stream); + raft::update_device(values_b.data(), params.matrix_b.values.data(), nnz_b, stream); + + raft::update_device(ind_verify.data(), params.matrix_verify.row_ind.data(), n_rows, stream); + raft::update_device( + ind_ptr_verify.data(), params.matrix_verify.row_ind_ptr.data(), nnz_result, stream); + raft::update_device( + values_verify.data(), params.matrix_verify.values.data(), nnz_result, stream); + + Index_ nnz = linalg::csr_add_calc_inds(ind_a.data(), + ind_ptr_a.data(), + values_a.data(), + nnz_a, + ind_b.data(), + ind_ptr_b.data(), + values_b.data(), + nnz_b, + n_rows, + ind_result.data(), + stream); ASSERT_TRUE(nnz == nnz_result); - ASSERT_TRUE(raft::devArrMatch(ind_verify.data(), ind_result.data(), - n_rows, raft::Compare())); - - linalg::csr_add_finalize( - ind_a.data(), ind_ptr_a.data(), values_a.data(), nnz_a, ind_b.data(), - ind_ptr_b.data(), values_b.data(), nnz_b, n_rows, ind_result.data(), - ind_ptr_result.data(), values_result.data(), stream); - - ASSERT_TRUE(raft::devArrMatch(ind_ptr_verify.data(), - ind_ptr_result.data(), nnz, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(values_verify.data(), - values_result.data(), nnz, - raft::Compare())); + ASSERT_TRUE(raft::devArrMatch( + ind_verify.data(), ind_result.data(), n_rows, raft::Compare())); + + linalg::csr_add_finalize(ind_a.data(), + ind_ptr_a.data(), + values_a.data(), + nnz_a, + ind_b.data(), + ind_ptr_b.data(), + values_b.data(), + nnz_b, + n_rows, + ind_result.data(), + ind_ptr_result.data(), + values_result.data(), + stream); + + ASSERT_TRUE(raft::devArrMatch( + ind_ptr_verify.data(), ind_ptr_result.data(), nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch( + values_verify.data(), values_result.data(), nnz, raft::Compare())); } protected: @@ -123,8 +131,8 @@ class CSRAddTest CSRAddInputs params; Index_ n_rows, nnz_a, nnz_b, nnz_result; - rmm::device_uvector ind_a, ind_b, ind_verify, ind_result, ind_ptr_a, - ind_ptr_b, ind_ptr_verify, ind_ptr_result; + rmm::device_uvector ind_a, ind_b, ind_verify, ind_result, ind_ptr_a, ind_ptr_b, + ind_ptr_verify, ind_ptr_result; rmm::device_uvector values_a, values_b, values_verify, values_result; }; @@ -157,10 +165,8 @@ const std::vector> csradd_inputs_d = { {2.0, 2.0, 0.5, 1.0, 0.5, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}}}, }; -INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestF, - ::testing::ValuesIn(csradd_inputs_f)); -INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestD, - ::testing::ValuesIn(csradd_inputs_d)); +INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestF, ::testing::ValuesIn(csradd_inputs_f)); +INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestD, ::testing::ValuesIn(csradd_inputs_d)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu index dd6ba1479e..5e4b164b37 100644 --- a/cpp/test/sparse/connect_components.cu +++ b/cpp/test/sparse/connect_components.cu @@ -50,24 +50,22 @@ struct ConnectComponentsInputs { }; template -class ConnectComponentsTest : public ::testing::TestWithParam< - ConnectComponentsInputs> { +class ConnectComponentsTest + : public ::testing::TestWithParam> { protected: - void basicTest() { + void basicTest() + { raft::handle_t handle; auto stream = handle.get_stream(); - params = ::testing::TestWithParam< - ConnectComponentsInputs>::GetParam(); + params = ::testing::TestWithParam>::GetParam(); raft::sparse::COO out_edges(handle.get_stream()); - rmm::device_uvector data(params.n_row * params.n_col, - handle.get_stream()); + rmm::device_uvector data(params.n_row * params.n_col, handle.get_stream()); - raft::copy(data.data(), params.data.data(), data.size(), - handle.get_stream()); + raft::copy(data.data(), params.data.data(), data.size(), handle.get_stream()); rmm::device_uvector indptr(params.n_row + 1, stream); @@ -76,44 +74,58 @@ class ConnectComponentsTest : public ::testing::TestWithParam< */ raft::sparse::COO knn_graph_coo(stream); - raft::sparse::selection::knn_graph( - handle, data.data(), params.n_row, params.n_col, - raft::distance::DistanceType::L2SqrtExpanded, knn_graph_coo, params.c); + raft::sparse::selection::knn_graph(handle, + data.data(), + params.n_row, + params.n_col, + raft::distance::DistanceType::L2SqrtExpanded, + knn_graph_coo, + params.c); - raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), - knn_graph_coo.nnz, indptr.data(), - params.n_row + 1, stream); + raft::sparse::convert::sorted_coo_to_csr( + knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), params.n_row + 1, stream); /** * 2. Construct MST, sorted by weights */ rmm::device_uvector colors(params.n_row, stream); - auto mst_coo = raft::mst::mst( - handle, indptr.data(), knn_graph_coo.cols(), knn_graph_coo.vals(), - params.n_row, knn_graph_coo.nnz, colors.data(), stream, false, true); + auto mst_coo = raft::mst::mst(handle, + indptr.data(), + knn_graph_coo.cols(), + knn_graph_coo.vals(), + params.n_row, + knn_graph_coo.nnz, + colors.data(), + stream, + false, + true); /** * 3. connect_components to fix connectivities */ - raft::linkage::FixConnectivitiesRedOp red_op( - colors.data(), params.n_row); + raft::linkage::FixConnectivitiesRedOp red_op(colors.data(), params.n_row); raft::linkage::connect_components( - handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, - red_op); + handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, red_op); /** * Construct final edge list */ rmm::device_uvector indptr2(params.n_row + 1, stream); - raft::sparse::convert::sorted_coo_to_csr(out_edges.rows(), out_edges.nnz, - indptr2.data(), params.n_row + 1, - stream); + raft::sparse::convert::sorted_coo_to_csr( + out_edges.rows(), out_edges.nnz, indptr2.data(), params.n_row + 1, stream); - auto output_mst = raft::mst::mst( - handle, indptr2.data(), out_edges.cols(), out_edges.vals(), params.n_row, - out_edges.nnz, colors.data(), stream, false, false); + auto output_mst = raft::mst::mst(handle, + indptr2.data(), + out_edges.cols(), + out_edges.vals(), + params.n_row, + out_edges.nnz, + colors.data(), + stream, + false, + false); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -135,366 +147,199 @@ const std::vector> fix_conn_inputsf2 = { // Test n_clusters == n_points {10, 5, - {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, - 0.77782677, 0.43772379, 0.4035871, 0.3282796, 0.47544681, 0.59862974, - 0.12319357, 0.06239463, 0.28200272, 0.1345717, 0.50498218, 0.5113505, - 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, - 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, - 0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792, - 0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692, - 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, + {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379, + 0.4035871, 0.3282796, 0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717, + 0.50498218, 0.5113505, 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, + 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, 0.84854131, 0.28890216, + 0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554, + 0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, 0.76166195, 0.66613745}, -1}, // Test n_points == 100 {100, 10, - {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, - 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, - 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, - 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, - 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, - 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, - 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, - 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, - 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, - 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, - 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, - 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, - 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, - 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, - 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, - 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, - 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, - 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, - 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, - 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, - 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, - 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, - 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, - 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, - 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, - 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, - 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, - 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, - 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, - 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, - 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, - 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, - 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, - 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, - 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, - 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, - 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, - 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, - 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, - 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, - 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, - 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, - 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, - 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, - 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, - 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, - 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, - 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, - 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, - 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, - 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, - 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, - 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, - 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, - 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, - 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, - 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, - 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, - 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, - 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, - 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, - 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, - 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, - 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, - 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, - 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, - 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, - 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, - 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, - 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, - 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, - 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, - 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, - 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, - 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, - 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, - 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, - 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, - 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, - 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, - 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, - 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, - 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, - 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, - 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, - 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, - 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, - 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, - 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, - 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, - 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, - 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, - 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, - 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, - 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, - 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, - 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, - 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, - 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, - 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, - 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, - 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, - 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, - 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, - 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, - 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, - 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, - 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, - 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, - 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, - 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, - 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, - 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, - 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, - 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, - 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, - 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, - 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, - 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, - 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, - 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, - 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, - 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, - 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, - 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, - 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, - 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, - 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, - 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, - 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, - 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, - 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, - 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, - 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, - 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, - 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, - 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, - 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, - 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, - 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, - 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, - 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, - 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, - 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, - 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, - 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, - 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, - 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, - 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, - 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, - 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, - 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, - 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, - 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, - 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, - 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, - 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, - 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, - 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, - 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, - 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, - 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, - 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, - 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, - 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, - 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, - 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, - 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, - 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, - 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, - 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, - 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, - 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, - 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, - 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, - 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, - 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, - 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, - 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, - 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, - 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, - 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, - 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, - 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, - 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, - 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, - 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, - 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, - 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, - 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, - 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, - 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, - 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, - 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, - 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, - 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, - 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, - 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, - 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, - 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, - 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, - 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, - 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, - 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, - 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, - 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, - 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, - 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, - 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, - 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, - 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, - 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, - 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, - 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, - 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, - 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, - 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, - 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, - 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, - 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, - 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, - 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, - 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, - 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, - 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, - 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, - 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, - 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, - 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, - 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, - 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, - 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, - 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, - 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, - 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, - 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, - 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, - 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, - 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, - 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, - 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, - 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, - 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, - 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, - 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, - 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, - 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, - 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, - 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, - 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, - 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, - 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, - 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, - 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, - 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, - 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, - 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, - 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, - 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, - 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, - 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, - 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, - 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, - 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, - 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, - 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, - 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, - 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, - 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, - 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, - 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, - 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, - 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, - 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, - 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, - 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, - 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, - 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, - 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, - 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, - 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, - 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, - 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, - 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, - 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, - 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, - 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, - 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, - 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, - 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, - 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, - 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, - 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, - 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, - 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, - 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, - 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, - 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, - 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, - 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, - 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, - 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, - 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, - 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, - 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, - 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, - 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, - 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, - 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, - 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, - 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, - 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, - 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, - 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, - 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, - 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, - 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, - 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, - 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, - 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, - 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, - 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, - 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, - 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, - 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, - 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, - 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, - 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, - 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, - 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, - 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, - 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, - 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, - 8.66342445e-01 + {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, + 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, + 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, + 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, + 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, + 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, + 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, + 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, + 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, + 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, + 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, + 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, + 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, + 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, + 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, + 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, + 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, + 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, + 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, + 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, + 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, + 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, + 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, + 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, + 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, + 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, + 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, + 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, + 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, + 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, + 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, + 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, + 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, + 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, + 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, + 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, + 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, + 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, + 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, + 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, + 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, + 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, + 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, + 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, + 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, + 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, + 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, + 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, + 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, + 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, + 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, + 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, + 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, + 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, + 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, + 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, + 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, + 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, + 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, + 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, + 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, + 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, + 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, + 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, + 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, + 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, + 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, + 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, + 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, + 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, + 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, + 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, + 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, + 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, + 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, + 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, + 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, + 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, + 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, + 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, + 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, + 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, + 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, + 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, + 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, + 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, + 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, + 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, + 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, + 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, + 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, + 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, + 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, + 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, + 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, + 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, + 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, + 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, + 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, + 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, + 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, + 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, + 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, + 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, + 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, + 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, + 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, + 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, + 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, + 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, + 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, + 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, + 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, + 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, + 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, + 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, + 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, + 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, + 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, + 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, + 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, + 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, + 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, + 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, + 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, + 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, + 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, + 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, + 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, + 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, + 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, + 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, + 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, + 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, + 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, + 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, + 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, + 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, + 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, + 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, + 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, + 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, + 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, + 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, + 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, + 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, + 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, + 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, + 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, + 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, + 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, + 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, + 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, + 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, + 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, + 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, + 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, + 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, + 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, + 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, + 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, + 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, + 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, + 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, + 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, + 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, + 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01 }, -4}}; typedef ConnectComponentsTest ConnectComponentsTestF_Int; -TEST_P(ConnectComponentsTestF_Int, Result) { +TEST_P(ConnectComponentsTestF_Int, Result) +{ /** - * Verify the src & dst vertices on each edge have different colors - */ + * Verify the src & dst vertices on each edge have different colors + */ EXPECT_TRUE(final_edges == params.n_row - 1); } -INSTANTIATE_TEST_CASE_P(ConnectComponentsTest, ConnectComponentsTestF_Int, +INSTANTIATE_TEST_CASE_P(ConnectComponentsTest, + ConnectComponentsTestF_Int, ::testing::ValuesIn(fix_conn_inputsf2)); }; // namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu index d30114bbcb..2028513010 100644 --- a/cpp/test/sparse/convert_coo.cu +++ b/cpp/test/sparse/convert_coo.cu @@ -44,23 +44,25 @@ class CSRtoCOOTest : public ::testing::TestWithParam> { stream(handle.get_stream()), ex_scan(params.ex_scan.size(), stream), verify(params.verify.size(), stream), - result(params.verify.size(), stream) {} + result(params.verify.size(), stream) + { + } protected: void SetUp() override {} - void Run() { + void Run() + { Index_ n_rows = params.ex_scan.size(); - Index_ nnz = params.verify.size(); + Index_ nnz = params.verify.size(); raft::update_device(ex_scan.data(), params.ex_scan.data(), n_rows, stream); raft::update_device(verify.data(), params.verify.data(), nnz, stream); - convert::csr_to_coo(ex_scan.data(), n_rows, result.data(), nnz, - stream); + convert::csr_to_coo(ex_scan.data(), n_rows, result.data(), nnz, stream); - ASSERT_TRUE(raft::devArrMatch(verify.data(), result.data(), nnz, - raft::Compare(), stream)); + ASSERT_TRUE( + raft::devArrMatch(verify.data(), result.data(), nnz, raft::Compare(), stream)); } protected: @@ -86,9 +88,11 @@ const std::vector> csrtocoo_inputs_64 = { {{0, 4, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 2, 3}}, }; -INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestI, +INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, + CSRtoCOOTestI, ::testing::ValuesIn(csrtocoo_inputs_32)); -INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestL, +INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, + CSRtoCOOTestL, ::testing::ValuesIn(csrtocoo_inputs_64)); } // namespace sparse diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu index cd665934c2..18e8b874bb 100644 --- a/cpp/test/sparse/convert_csr.cu +++ b/cpp/test/sparse/convert_csr.cu @@ -36,14 +36,13 @@ struct SparseConvertCSRInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const SparseConvertCSRInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SparseConvertCSRInputs& dims) +{ return os; } template -class SparseConvertCSRTest - : public ::testing::TestWithParam> { +class SparseConvertCSRTest : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -53,18 +52,18 @@ class SparseConvertCSRTest SparseConvertCSRInputs params; }; -const std::vector> inputsf = { - {5, 10, 5, 1234ULL}}; +const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseConvertCSRTest SortedCOOToCSR; -TEST_P(SortedCOOToCSR, Result) { +TEST_P(SortedCOOToCSR, Result) +{ cudaStream_t stream; cudaStreamCreate(&stream); int nnz = 8; - int *in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; - int *exp_h = new int[4]{0, 2, 4, 6}; + int* in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; + int* exp_h = new int[4]{0, 2, 4, 6}; rmm::device_uvector in(nnz, stream); rmm::device_uvector exp(4, stream); @@ -78,8 +77,7 @@ TEST_P(SortedCOOToCSR, Result) { convert::sorted_coo_to_csr(in.data(), nnz, out.data(), 4, stream); - ASSERT_TRUE( - raft::devArrMatch(out.data(), exp.data(), 4, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.data(), exp.data(), 4, raft::Compare())); cudaStreamDestroy(stream); @@ -87,8 +85,7 @@ TEST_P(SortedCOOToCSR, Result) { delete[] exp_h; } -INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR, ::testing::ValuesIn(inputsf)); /******************************** adj graph ********************************/ @@ -102,8 +99,7 @@ struct CSRAdjGraphInputs { }; template -class CSRAdjGraphTest - : public ::testing::TestWithParam> { +class CSRAdjGraphTest : public ::testing::TestWithParam> { public: CSRAdjGraphTest() : params(::testing::TestWithParam>::GetParam()), @@ -111,24 +107,27 @@ class CSRAdjGraphTest row_ind(params.n_rows, stream), adj(params.n_rows * params.n_cols, stream), result(params.verify.size(), stream), - verify(params.verify.size(), stream) {} + verify(params.verify.size(), stream) + { + } protected: void SetUp() override { nnz = params.verify.size(); } - void Run() { - raft::update_device(row_ind.data(), params.row_ind.data(), params.n_rows, + void Run() + { + raft::update_device(row_ind.data(), params.row_ind.data(), params.n_rows, stream); + raft::update_device(adj.data(), + reinterpret_cast(params.adj.data()), + params.n_rows * params.n_cols, stream); - raft::update_device(adj.data(), reinterpret_cast(params.adj.data()), - params.n_rows * params.n_cols, stream); raft::update_device(verify.data(), params.verify.data(), nnz, stream); - convert::csr_adj_graph_batched(row_ind.data(), params.n_cols, - nnz, params.n_rows, adj.data(), - result.data(), stream); + convert::csr_adj_graph_batched( + row_ind.data(), params.n_cols, nnz, params.n_rows, adj.data(), result.data(), stream); - ASSERT_TRUE(raft::devArrMatch(verify.data(), result.data(), nnz, - raft::Compare())); + ASSERT_TRUE( + raft::devArrMatch(verify.data(), result.data(), nnz, raft::Compare())); } protected: @@ -162,9 +161,11 @@ const std::vector> csradjgraph_inputs_l = { {0, 1, 2, 0, 1, 2, 0, 1, 2}}, }; -INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestI, +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, + CSRAdjGraphTestI, ::testing::ValuesIn(csradjgraph_inputs_i)); -INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestL, +INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, + CSRAdjGraphTestL, ::testing::ValuesIn(csradjgraph_inputs_l)); } // namespace sparse diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu index 33893649bd..16372dc0f6 100644 --- a/cpp/test/sparse/csr_row_slice.cu +++ b/cpp/test/sparse/csr_row_slice.cu @@ -47,18 +47,16 @@ struct CSRRowSliceInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const CSRRowSliceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const CSRRowSliceInputs& dims) +{ return os; } template -class CSRRowSliceTest - : public ::testing::TestWithParam> { +class CSRRowSliceTest : public ::testing::TestWithParam> { public: CSRRowSliceTest() - : params(::testing::TestWithParam< - CSRRowSliceInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), indptr(0, stream), indices(0, stream), @@ -68,7 +66,8 @@ class CSRRowSliceTest out_data_ref(0, stream), out_indptr(0, stream), out_indices(0, stream), - out_data(0, stream) { + out_data(0, stream) + { indptr.resize(params.indptr_h.size(), stream); indices.resize(params.indices_h.size(), stream); data.resize(params.data_h.size(), stream); @@ -81,54 +80,65 @@ class CSRRowSliceTest } protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream); update_device(indices.data(), indices_h.data(), indices_h.size(), stream); update_device(data.data(), data_h.data(), data_h.size(), stream); - std::vector out_indptr_ref_h = params.out_indptr_ref_h; + std::vector out_indptr_ref_h = params.out_indptr_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; - std::vector out_data_ref_h = params.out_data_ref_h; - - update_device(out_indptr_ref.data(), out_indptr_ref_h.data(), - out_indptr_ref_h.size(), stream); - update_device(out_indices_ref.data(), out_indices_ref_h.data(), - out_indices_ref_h.size(), stream); - update_device(out_data_ref.data(), out_data_ref_h.data(), - out_data_ref_h.size(), stream); + std::vector out_data_ref_h = params.out_data_ref_h; + + update_device(out_indptr_ref.data(), out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream); + update_device( + out_indices_ref.data(), out_indices_ref_h.data(), out_indices_ref_h.size(), stream); + update_device(out_data_ref.data(), out_data_ref_h.data(), out_data_ref_h.size(), stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } - void SetUp() override { + void SetUp() override + { make_data(); int csr_start_offset; int csr_stop_offset; - raft::sparse::op::csr_row_slice_indptr( - params.start_row, params.stop_row, indptr.data(), out_indptr.data(), - &csr_start_offset, &csr_stop_offset, stream); - - raft::sparse::op::csr_row_slice_populate( - csr_start_offset, csr_stop_offset, indices.data(), data.data(), - out_indices.data(), out_data.data(), stream); + raft::sparse::op::csr_row_slice_indptr(params.start_row, + params.stop_row, + indptr.data(), + out_indptr.data(), + &csr_start_offset, + &csr_stop_offset, + stream); + + raft::sparse::op::csr_row_slice_populate(csr_start_offset, + csr_stop_offset, + indices.data(), + data.data(), + out_indices.data(), + out_data.data(), + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } - void compare() { - ASSERT_TRUE(devArrMatch(out_indptr.data(), out_indptr_ref.data(), + void compare() + { + ASSERT_TRUE(devArrMatch(out_indptr.data(), + out_indptr_ref.data(), params.out_indptr_ref_h.size(), Compare())); - ASSERT_TRUE(devArrMatch(out_indices.data(), out_indices_ref.data(), + ASSERT_TRUE(devArrMatch(out_indices.data(), + out_indices_ref.data(), params.out_indices_ref_h.size(), Compare())); - ASSERT_TRUE(devArrMatch(out_data.data(), out_data_ref.data(), - params.out_data_ref_h.size(), Compare())); + ASSERT_TRUE(devArrMatch( + out_data.data(), out_data_ref.data(), params.out_data_ref_h.size(), Compare())); } protected: @@ -173,8 +183,7 @@ const std::vector> inputs_i32_f = { }; typedef CSRRowSliceTest CSRRowSliceTestF; TEST_P(CSRRowSliceTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(CSRRowSliceTest, CSRRowSliceTestF, - ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(CSRRowSliceTest, CSRRowSliceTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu index 1a206c8499..85f00cdd27 100644 --- a/cpp/test/sparse/csr_to_dense.cu +++ b/cpp/test/sparse/csr_to_dense.cu @@ -45,24 +45,23 @@ struct CSRToDenseInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const CSRToDenseInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const CSRToDenseInputs& dims) +{ return os; } template -class CSRToDenseTest - : public ::testing::TestWithParam> { +class CSRToDenseTest : public ::testing::TestWithParam> { public: CSRToDenseTest() - : params(::testing::TestWithParam< - CSRToDenseInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(raft_handle.get_stream()), indptr(0, stream), indices(0, stream), data(0, stream), out_ref(0, stream), - out(0, stream) { + out(0, stream) + { indptr.resize(params.indptr_h.size(), stream); indices.resize(params.indices_h.size(), stream); data.resize(params.data_h.size(), stream); @@ -71,10 +70,11 @@ class CSRToDenseTest } protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream); update_device(indices.data(), indices_h.data(), indices_h.size(), stream); @@ -86,22 +86,31 @@ class CSRToDenseTest CUDA_CHECK(cudaStreamSynchronize(stream)); } - void SetUp() override { + void SetUp() override + { CUSPARSE_CHECK(cusparseCreate(&handle)); make_data(); - convert::csr_to_dense(handle, params.nrows, params.ncols, indptr.data(), - indices.data(), data.data(), params.nrows, out.data(), - stream, true); + convert::csr_to_dense(handle, + params.nrows, + params.ncols, + indptr.data(), + indices.data(), + data.data(), + params.nrows, + out.data(), + stream, + true); CUDA_CHECK(cudaStreamSynchronize(stream)); CUSPARSE_CHECK(cusparseDestroy(handle)); } - void compare() { - ASSERT_TRUE(devArrMatch(out.data(), out_ref.data(), params.out_ref_h.size(), - Compare())); + void compare() + { + ASSERT_TRUE( + devArrMatch(out.data(), out_ref.data(), params.out_ref_h.size(), Compare())); } protected: @@ -129,13 +138,26 @@ const std::vector> inputs_i32_f = { {0, 2, 4, 6, 8}, {0, 1, 2, 3, 0, 1, 2, 3}, // indices {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f}, - {1.0f, 3.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 5.0f, 50.0f, 28.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 16.0f, 2.0f}}, + {1.0f, + 3.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 1.0f, + 5.0f, + 50.0f, + 28.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 16.0f, + 2.0f}}, }; typedef CSRToDenseTest CSRToDenseTestF; TEST_P(CSRToDenseTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(CSRToDenseTest, CSRToDenseTestF, - ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(CSRToDenseTest, CSRToDenseTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu index 8983f10d2b..3380eaa6fb 100644 --- a/cpp/test/sparse/csr_transpose.cu +++ b/cpp/test/sparse/csr_transpose.cu @@ -47,18 +47,16 @@ struct CSRTransposeInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const CSRTransposeInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const CSRTransposeInputs& dims) +{ return os; } template -class CSRTransposeTest - : public ::testing::TestWithParam> { +class CSRTransposeTest : public ::testing::TestWithParam> { public: CSRTransposeTest() - : params(::testing::TestWithParam< - CSRTransposeInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(raft_handle.get_stream()), indptr(0, stream), indices(0, stream), @@ -68,7 +66,8 @@ class CSRTransposeTest out_data_ref(0, stream), out_indptr(0, stream), out_indices(0, stream), - out_data(0, stream) { + out_data(0, stream) + { indptr.resize(params.indptr_h.size(), stream); indices.resize(params.indices_h.size(), stream); data.resize(params.data_h.size(), stream); @@ -81,50 +80,60 @@ class CSRTransposeTest } protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream); update_device(indices.data(), indices_h.data(), indices_h.size(), stream); update_device(data.data(), data_h.data(), data_h.size(), stream); - std::vector out_indptr_ref_h = params.out_indptr_ref_h; + std::vector out_indptr_ref_h = params.out_indptr_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; - std::vector out_data_ref_h = params.out_data_ref_h; - - update_device(out_indptr_ref.data(), out_indptr_ref_h.data(), - out_indptr_ref_h.size(), stream); - update_device(out_indices_ref.data(), out_indices_ref_h.data(), - out_indices_ref_h.size(), stream); - update_device(out_data_ref.data(), out_data_ref_h.data(), - out_data_ref_h.size(), stream); + std::vector out_data_ref_h = params.out_data_ref_h; + + update_device(out_indptr_ref.data(), out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream); + update_device( + out_indices_ref.data(), out_indices_ref_h.data(), out_indices_ref_h.size(), stream); + update_device(out_data_ref.data(), out_data_ref_h.data(), out_data_ref_h.size(), stream); } - void SetUp() override { + void SetUp() override + { CUSPARSE_CHECK(cusparseCreate(&handle)); make_data(); - raft::sparse::linalg::csr_transpose( - handle, indptr.data(), indices.data(), data.data(), out_indptr.data(), - out_indices.data(), out_data.data(), params.nrows, params.ncols, - params.nnz, stream); + raft::sparse::linalg::csr_transpose(handle, + indptr.data(), + indices.data(), + data.data(), + out_indptr.data(), + out_indices.data(), + out_data.data(), + params.nrows, + params.ncols, + params.nnz, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); CUSPARSE_CHECK(cusparseDestroy(handle)); } - void compare() { - ASSERT_TRUE(devArrMatch(out_indptr.data(), out_indptr_ref.data(), + void compare() + { + ASSERT_TRUE(devArrMatch(out_indptr.data(), + out_indptr_ref.data(), params.out_indptr_ref_h.size(), Compare())); - ASSERT_TRUE(devArrMatch(out_indices.data(), out_indices_ref.data(), + ASSERT_TRUE(devArrMatch(out_indices.data(), + out_indices_ref.data(), params.out_indices_ref_h.size(), Compare())); - ASSERT_TRUE(devArrMatch(out_data.data(), out_data_ref.data(), - params.out_data_ref_h.size(), Compare())); + ASSERT_TRUE(devArrMatch( + out_data.data(), out_data_ref.data(), params.out_data_ref_h.size(), Compare())); } protected: @@ -163,8 +172,7 @@ const std::vector> inputs_i32_f = { }; typedef CSRTransposeTest CSRTransposeTestF; TEST_P(CSRTransposeTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(CSRTransposeTest, CSRTransposeTestF, - ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(CSRTransposeTest, CSRTransposeTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace sparse }; // end namespace raft diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu index fbadadb29d..8b1c7988d6 100644 --- a/cpp/test/sparse/degree.cu +++ b/cpp/test/sparse/degree.cu @@ -33,8 +33,7 @@ struct SparseDegreeInputs { }; template -class SparseDegreeTests - : public ::testing::TestWithParam> { +class SparseDegreeTests : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -47,20 +46,19 @@ class SparseDegreeTests const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseDegreeTests COODegree; -TEST_P(COODegree, Result) { +TEST_P(COODegree, Result) +{ cudaStream_t stream; cudaStreamCreate(&stream); int in_rows_h[5] = {0, 0, 1, 2, 2}; - int verify_h[5] = {2, 1, 2, 0, 0}; + int verify_h[5] = {2, 1, 2, 0, 0}; rmm::device_uvector in_rows(5, stream); rmm::device_uvector verify(5, stream); rmm::device_uvector results(5, stream); - CUDA_CHECK( - cudaMemsetAsync(verify.data(), 0, verify.size() * sizeof(int), stream)); - CUDA_CHECK( - cudaMemsetAsync(results.data(), 0, results.size() * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(verify.data(), 0, verify.size() * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(results.data(), 0, results.size() * sizeof(int), stream)); raft::update_device(in_rows.data(), *&in_rows_h, 5, stream); raft::update_device(verify.data(), *&verify_h, 5, stream); @@ -68,50 +66,43 @@ TEST_P(COODegree, Result) { linalg::coo_degree<32>(in_rows.data(), 5, results.data(), stream); cudaDeviceSynchronize(); - ASSERT_TRUE(raft::devArrMatch(verify.data(), results.data(), 5, - raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(verify.data(), results.data(), 5, raft::Compare())); CUDA_CHECK(cudaStreamDestroy(stream)); } typedef SparseDegreeTests COODegreeNonzero; -TEST_P(COODegreeNonzero, Result) { +TEST_P(COODegreeNonzero, Result) +{ cudaStream_t stream; cudaStreamCreate(&stream); - int in_rows_h[5] = {0, 0, 1, 2, 2}; + int in_rows_h[5] = {0, 0, 1, 2, 2}; float in_vals_h[5] = {0.0, 5.0, 0.0, 1.0, 1.0}; - int verify_h[5] = {1, 0, 2, 0, 0}; + int verify_h[5] = {1, 0, 2, 0, 0}; rmm::device_uvector in_rows(5, stream); rmm::device_uvector verify(5, stream); rmm::device_uvector results(5, stream); rmm::device_uvector in_vals(5, stream); - CUDA_CHECK( - cudaMemsetAsync(verify.data(), 0, verify.size() * sizeof(int), stream)); - CUDA_CHECK( - cudaMemsetAsync(results.data(), 0, results.size() * sizeof(int), stream)); - CUDA_CHECK( - cudaMemsetAsync(in_vals.data(), 0, in_vals.size() * sizeof(float), stream)); + CUDA_CHECK(cudaMemsetAsync(verify.data(), 0, verify.size() * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(results.data(), 0, results.size() * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(in_vals.data(), 0, in_vals.size() * sizeof(float), stream)); raft::update_device(in_rows.data(), *&in_rows_h, 5, stream); raft::update_device(verify.data(), *&verify_h, 5, stream); raft::update_device(in_vals.data(), *&in_vals_h, 5, stream); - linalg::coo_degree_nz<32, float>(in_rows.data(), in_vals.data(), 5, - results.data(), stream); + linalg::coo_degree_nz<32, float>(in_rows.data(), in_vals.data(), 5, results.data(), stream); cudaDeviceSynchronize(); - ASSERT_TRUE(raft::devArrMatch(verify.data(), results.data(), 5, - raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(verify.data(), results.data(), 5, raft::Compare())); CUDA_CHECK(cudaStreamDestroy(stream)); } -INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree, - ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegreeNonzero, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegreeNonzero, ::testing::ValuesIn(inputsf)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu index d24199c5fc..000d58d029 100644 --- a/cpp/test/sparse/dist_coo_spmv.cu +++ b/cpp/test/sparse/dist_coo_spmv.cu @@ -55,28 +55,26 @@ struct InputConfiguration { }; using dense_smem_strategy_t = detail::dense_smem_strategy; -using hash_strategy_t = detail::hash_strategy; +using hash_strategy_t = detail::hash_strategy; template struct SparseDistanceCOOSPMVInputs { InputConfiguration input_configuration; float capacity_threshold = 0.5; - int map_size = - detail::hash_strategy::get_map_size(); + int map_size = detail::hash_strategy::get_map_size(); }; template -::std::ostream &operator<<( - ::std::ostream &os, - const SparseDistanceCOOSPMVInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, + const SparseDistanceCOOSPMVInputs& dims) +{ return os; } template class SparseDistanceCOOSPMVTest - : public ::testing::TestWithParam< - SparseDistanceCOOSPMVInputs> { + : public ::testing::TestWithParam> { public: SparseDistanceCOOSPMVTest() : dist_config(handle), @@ -84,62 +82,74 @@ class SparseDistanceCOOSPMVTest indices(0, handle.get_stream()), data(0, handle.get_stream()), out_dists(0, handle.get_stream()), - out_dists_ref(0, handle.get_stream()) {} + out_dists_ref(0, handle.get_stream()) + { + } - template > * = nullptr> - U make_strategy() { + template >* = nullptr> + U make_strategy() + { return strategy_t(dist_config, params.capacity_threshold, params.map_size); } - template > * = nullptr> - U make_strategy() { + template >* = nullptr> + U make_strategy() + { return strategy_t(dist_config); } template - void compute_dist(reduce_f reduce_func, accum_f accum_func, - write_f write_func, bool rev = true) { - rmm::device_uvector coo_rows( - max(dist_config.b_nnz, dist_config.a_nnz), - dist_config.handle.get_stream()); - - raft::sparse::convert::csr_to_coo(dist_config.b_indptr, dist_config.b_nrows, - coo_rows.data(), dist_config.b_nnz, + void compute_dist(reduce_f reduce_func, accum_f accum_func, write_f write_func, bool rev = true) + { + rmm::device_uvector coo_rows(max(dist_config.b_nnz, dist_config.a_nnz), + dist_config.handle.get_stream()); + + raft::sparse::convert::csr_to_coo(dist_config.b_indptr, + dist_config.b_nrows, + coo_rows.data(), + dist_config.b_nnz, dist_config.handle.get_stream()); strategy_t selected_strategy = make_strategy(); - detail::balanced_coo_pairwise_generalized_spmv( - out_dists.data(), dist_config, coo_rows.data(), reduce_func, accum_func, - write_func, selected_strategy); + detail::balanced_coo_pairwise_generalized_spmv(out_dists.data(), + dist_config, + coo_rows.data(), + reduce_func, + accum_func, + write_func, + selected_strategy); if (rev) { - raft::sparse::convert::csr_to_coo( - dist_config.a_indptr, dist_config.a_nrows, coo_rows.data(), - dist_config.a_nnz, dist_config.handle.get_stream()); - - detail::balanced_coo_pairwise_generalized_spmv_rev( - out_dists.data(), dist_config, coo_rows.data(), reduce_func, accum_func, - write_func, selected_strategy); + raft::sparse::convert::csr_to_coo(dist_config.a_indptr, + dist_config.a_nrows, + coo_rows.data(), + dist_config.a_nnz, + dist_config.handle.get_stream()); + + detail::balanced_coo_pairwise_generalized_spmv_rev(out_dists.data(), + dist_config, + coo_rows.data(), + reduce_func, + accum_func, + write_func, + selected_strategy); } } - void run_spmv() { + void run_spmv() + { switch (params.input_configuration.metric) { case raft::distance::DistanceType::InnerProduct: - compute_dist(detail::Product(), detail::Sum(), detail::AtomicAdd(), - true); + compute_dist(detail::Product(), detail::Sum(), detail::AtomicAdd(), true); break; case raft::distance::DistanceType::L2Unexpanded: compute_dist(detail::SqDiff(), detail::Sum(), detail::AtomicAdd()); break; case raft::distance::DistanceType::Canberra: compute_dist( - [] __device__(value_t a, value_t b) { - return fabsf(a - b) / (fabsf(a) + fabsf(b)); - }, - detail::Sum(), detail::AtomicAdd()); + [] __device__(value_t a, value_t b) { return fabsf(a - b) / (fabsf(a) + fabsf(b)); }, + detail::Sum(), + detail::AtomicAdd()); break; case raft::distance::DistanceType::L1: compute_dist(detail::AbsDiff(), detail::Sum(), detail::AtomicAdd()); @@ -148,26 +158,27 @@ class SparseDistanceCOOSPMVTest compute_dist(detail::AbsDiff(), detail::Max(), detail::AtomicMax()); break; case raft::distance::DistanceType::LpUnexpanded: { - compute_dist(detail::PDiff(params.input_configuration.metric_arg), - detail::Sum(), detail::AtomicAdd()); + compute_dist( + detail::PDiff(params.input_configuration.metric_arg), detail::Sum(), detail::AtomicAdd()); float p = 1.0f / params.input_configuration.metric_arg; raft::linalg::unaryOp( - out_dists.data(), out_dists.data(), + out_dists.data(), + out_dists.data(), dist_config.a_nrows * dist_config.b_nrows, [=] __device__(value_t input) { return powf(input, p); }, dist_config.handle.get_stream()); } break; - default: - throw raft::exception("Unknown distance"); + default: throw raft::exception("Unknown distance"); } } protected: - void make_data() { - std::vector indptr_h = params.input_configuration.indptr_h; + void make_data() + { + std::vector indptr_h = params.input_configuration.indptr_h; std::vector indices_h = params.input_configuration.indices_h; - std::vector data_h = params.input_configuration.data_h; + std::vector data_h = params.input_configuration.data_h; auto stream = handle.get_stream(); indptr.resize(indptr_h.size(), stream); @@ -178,33 +189,32 @@ class SparseDistanceCOOSPMVTest update_device(indices.data(), indices_h.data(), indices_h.size(), stream); update_device(data.data(), data_h.data(), data_h.size(), stream); - std::vector out_dists_ref_h = - params.input_configuration.out_dists_ref_h; + std::vector out_dists_ref_h = params.input_configuration.out_dists_ref_h; out_dists_ref.resize((indptr_h.size() - 1) * (indptr_h.size() - 1), stream); - update_device(out_dists_ref.data(), out_dists_ref_h.data(), - out_dists_ref_h.size(), stream); + update_device(out_dists_ref.data(), out_dists_ref_h.data(), out_dists_ref_h.size(), stream); } - void SetUp() override { + void SetUp() override + { params = ::testing::TestWithParam< SparseDistanceCOOSPMVInputs>::GetParam(); make_data(); - dist_config.b_nrows = params.input_configuration.indptr_h.size() - 1; - dist_config.b_ncols = params.input_configuration.n_cols; - dist_config.b_nnz = params.input_configuration.indices_h.size(); - dist_config.b_indptr = indptr.data(); + dist_config.b_nrows = params.input_configuration.indptr_h.size() - 1; + dist_config.b_ncols = params.input_configuration.n_cols; + dist_config.b_nnz = params.input_configuration.indices_h.size(); + dist_config.b_indptr = indptr.data(); dist_config.b_indices = indices.data(); - dist_config.b_data = data.data(); - dist_config.a_nrows = params.input_configuration.indptr_h.size() - 1; - dist_config.a_ncols = params.input_configuration.n_cols; - dist_config.a_nnz = params.input_configuration.indices_h.size(); - dist_config.a_indptr = indptr.data(); + dist_config.b_data = data.data(); + dist_config.a_nrows = params.input_configuration.indptr_h.size() - 1; + dist_config.a_ncols = params.input_configuration.n_cols; + dist_config.a_nnz = params.input_configuration.indices_h.size(); + dist_config.a_indptr = indptr.data(); dist_config.a_indices = indices.data(); - dist_config.a_data = data.data(); + dist_config.a_data = data.data(); int out_size = dist_config.a_nrows * dist_config.b_nrows; @@ -215,8 +225,10 @@ class SparseDistanceCOOSPMVTest CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void compare() { - ASSERT_TRUE(devArrMatch(out_dists_ref.data(), out_dists.data(), + void compare() + { + ASSERT_TRUE(devArrMatch(out_dists_ref.data(), + out_dists.data(), params.input_configuration.out_dists_ref_h.size(), CompareApprox(1e-3))); } @@ -241,8 +253,7 @@ const InputConfiguration input_inner_product = { {0, 2, 4, 6, 8}, {0, 1, 0, 1, 0, 1, 0, 1}, {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}, - {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, - 5.0}, + {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0}, raft::distance::DistanceType::InnerProduct, 0.0}; @@ -273,384 +284,379 @@ const InputConfiguration input_l2_unexpanded = { raft::distance::DistanceType::L2Unexpanded, 0.0}; -const InputConfiguration input_canberra = - {10, - {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0.0, - 3.3954660629919076, - 5.6469232737388815, - 6.373112846266441, - 4.0212880272531715, - 6.916281504639404, - 5.741508386786526, - 5.411470999663036, - 9.0, - 4.977014354725805, - 3.3954660629919076, - 0.0, - 7.56256082439209, - 5.540261147481582, - 4.832322929216881, - 4.62003193872216, - 6.498056792320361, - 4.309846252268695, - 6.317531174829905, - 6.016362684141827, - 5.6469232737388815, - 7.56256082439209, - 0.0, - 5.974878731322299, - 4.898357301336036, - 6.442097410320605, - 5.227077347287883, - 7.134101195584642, - 5.457753923371659, - 7.0, - 6.373112846266441, - 5.540261147481582, - 5.974878731322299, - 0.0, - 5.5507273748583, - 4.897749658726415, - 9.0, - 8.398776718824767, - 3.908281400328807, - 4.83431066343688, - 4.0212880272531715, - 4.832322929216881, - 4.898357301336036, - 5.5507273748583, - 0.0, - 6.632989819428174, - 7.438852294822894, - 5.6631570310967465, - 7.579428202635459, - 6.760811985364303, - 6.916281504639404, - 4.62003193872216, - 6.442097410320605, - 4.897749658726415, - 6.632989819428174, - 0.0, - 5.249404187382862, - 6.072559523278559, - 4.07661278488929, - 6.19678948003145, - 5.741508386786526, - 6.498056792320361, - 5.227077347287883, - 9.0, - 7.438852294822894, - 5.249404187382862, - 0.0, - 3.854811639654704, - 6.652724827169063, - 5.298236851430971, - 5.411470999663036, - 4.309846252268695, - 7.134101195584642, - 8.398776718824767, - 5.6631570310967465, - 6.072559523278559, - 3.854811639654704, - 0.0, - 7.529184598969917, - 6.903282911791188, - 9.0, - 6.317531174829905, - 5.457753923371659, - 3.908281400328807, - 7.579428202635459, - 4.07661278488929, - 6.652724827169063, - 7.529184598969917, - 0.0, - 7.0, - 4.977014354725805, - 6.016362684141827, - 7.0, - 4.83431066343688, - 6.760811985364303, - 6.19678948003145, - 5.298236851430971, - 6.903282911791188, - 7.0, - 0.0}, - raft::distance::DistanceType::Canberra, - 0.0}; - -const InputConfiguration input_lp_unexpanded = - {10, - {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0.0, - 1.31462855332296, - 1.3690307816129905, - 1.698603990921237, - 1.3460470789553531, - 1.6636670712582544, - 1.2651744044972217, - 1.1938329352055201, - 1.8811409082590185, - 1.3653115050624267, - 1.31462855332296, - 0.0, - 1.9447722703291133, - 1.42818777206562, - 1.4685491458946494, - 1.3071999866010466, - 1.4988622861692171, - 0.9698559287406783, - 1.4972023224597841, - 1.5243383567266802, - 1.3690307816129905, - 1.9447722703291133, - 0.0, - 1.2748400840107568, - 1.0599569946448246, - 1.546591282841402, - 1.147526531928459, - 1.447002179128145, - 1.5982242387673176, - 1.3112533607072414, - 1.698603990921237, - 1.42818777206562, - 1.2748400840107568, - 0.0, - 1.038121552545461, - 1.011788365364402, - 1.3907391109256988, - 1.3128200942311496, - 1.19595706584447, - 1.3233328139624725, - 1.3460470789553531, - 1.4685491458946494, - 1.0599569946448246, - 1.038121552545461, - 0.0, - 1.3642741698145529, - 1.3493868683808095, - 1.394942694628328, - 1.572881849642552, - 1.380122665319464, - 1.6636670712582544, - 1.3071999866010466, - 1.546591282841402, - 1.011788365364402, - 1.3642741698145529, - 0.0, - 1.018961640373018, - 1.0114394258945634, - 0.8338711034820684, - 1.1247823842299223, - 1.2651744044972217, - 1.4988622861692171, - 1.147526531928459, - 1.3907391109256988, - 1.3493868683808095, - 1.018961640373018, - 0.0, - 0.7701238110357329, - 1.245486437864406, - 0.5551259549534626, - 1.1938329352055201, - 0.9698559287406783, - 1.447002179128145, - 1.3128200942311496, - 1.394942694628328, - 1.0114394258945634, - 0.7701238110357329, - 0.0, - 1.1886800117391216, - 1.0083692448135637, - 1.8811409082590185, - 1.4972023224597841, - 1.5982242387673176, - 1.19595706584447, - 1.572881849642552, - 0.8338711034820684, - 1.245486437864406, - 1.1886800117391216, - 0.0, - 1.3661374102525012, - 1.3653115050624267, - 1.5243383567266802, - 1.3112533607072414, - 1.3233328139624725, - 1.380122665319464, - 1.1247823842299223, - 0.5551259549534626, - 1.0083692448135637, - 1.3661374102525012, - 0.0}, - raft::distance::DistanceType::LpUnexpanded, - 2.0}; - -const InputConfiguration input_linf = - {10, - {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0.0, - 0.9251771844789913, - 0.9036452083899731, - 0.9251771844789913, - 0.8706483735804971, - 0.9251771844789913, - 0.717493881903289, - 0.6920214832303888, - 0.9251771844789913, - 0.9251771844789913, - 0.9251771844789913, - 0.0, - 0.9036452083899731, - 0.8655339692155823, - 0.8706483735804971, - 0.8655339692155823, - 0.8655339692155823, - 0.6329837991017668, - 0.8655339692155823, - 0.8655339692155823, - 0.9036452083899731, - 0.9036452083899731, - 0.0, - 0.7988276152181608, - 0.7028075145996631, - 0.9036452083899731, - 0.9036452083899731, - 0.9036452083899731, - 0.8429599432532096, - 0.9036452083899731, - 0.9251771844789913, - 0.8655339692155823, - 0.7988276152181608, - 0.0, - 0.48376552205293305, - 0.8206394616536681, - 0.8206394616536681, - 0.8206394616536681, - 0.8429599432532096, - 0.8206394616536681, - 0.8706483735804971, - 0.8706483735804971, - 0.7028075145996631, - 0.48376552205293305, - 0.0, - 0.8706483735804971, - 0.8706483735804971, - 0.8706483735804971, - 0.8429599432532096, - 0.8706483735804971, - 0.9251771844789913, - 0.8655339692155823, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.0, - 0.8853924473642432, - 0.535821510936138, - 0.6497196601457607, - 0.8853924473642432, - 0.717493881903289, - 0.8655339692155823, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.8853924473642432, - 0.0, - 0.5279604218147174, - 0.6658348373853169, - 0.33799874888632914, - 0.6920214832303888, - 0.6329837991017668, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.535821510936138, - 0.5279604218147174, - 0.0, - 0.662579808115858, - 0.5079750812968089, - 0.9251771844789913, - 0.8655339692155823, - 0.8429599432532096, - 0.8429599432532096, - 0.8429599432532096, - 0.6497196601457607, - 0.6658348373853169, - 0.662579808115858, - 0.0, - 0.8429599432532096, - 0.9251771844789913, - 0.8655339692155823, - 0.9036452083899731, - 0.8206394616536681, - 0.8706483735804971, - 0.8853924473642432, - 0.33799874888632914, - 0.5079750812968089, - 0.8429599432532096, - 0.0}, - raft::distance::DistanceType::Linf, - 0.0}; - -const InputConfiguration input_l1 = { - 4, - {0, 1, 1, 2, 4}, - {3, 2, 0, 1}, // indices - {0.99296, 0.42180, 0.11687, 0.305869}, - { - // dense output - 0.0, - 0.99296, - 1.41476, - 1.415707, - 0.99296, - 0.0, - 0.42180, - 0.42274, - 1.41476, - 0.42180, - 0.0, - 0.84454, - 1.41570, - 0.42274, - 0.84454, - 0.0, - }, - raft::distance::DistanceType::L1, +const InputConfiguration input_canberra = { + 10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 3.3954660629919076, + 5.6469232737388815, + 6.373112846266441, + 4.0212880272531715, + 6.916281504639404, + 5.741508386786526, + 5.411470999663036, + 9.0, + 4.977014354725805, + 3.3954660629919076, + 0.0, + 7.56256082439209, + 5.540261147481582, + 4.832322929216881, + 4.62003193872216, + 6.498056792320361, + 4.309846252268695, + 6.317531174829905, + 6.016362684141827, + 5.6469232737388815, + 7.56256082439209, + 0.0, + 5.974878731322299, + 4.898357301336036, + 6.442097410320605, + 5.227077347287883, + 7.134101195584642, + 5.457753923371659, + 7.0, + 6.373112846266441, + 5.540261147481582, + 5.974878731322299, + 0.0, + 5.5507273748583, + 4.897749658726415, + 9.0, + 8.398776718824767, + 3.908281400328807, + 4.83431066343688, + 4.0212880272531715, + 4.832322929216881, + 4.898357301336036, + 5.5507273748583, + 0.0, + 6.632989819428174, + 7.438852294822894, + 5.6631570310967465, + 7.579428202635459, + 6.760811985364303, + 6.916281504639404, + 4.62003193872216, + 6.442097410320605, + 4.897749658726415, + 6.632989819428174, + 0.0, + 5.249404187382862, + 6.072559523278559, + 4.07661278488929, + 6.19678948003145, + 5.741508386786526, + 6.498056792320361, + 5.227077347287883, + 9.0, + 7.438852294822894, + 5.249404187382862, + 0.0, + 3.854811639654704, + 6.652724827169063, + 5.298236851430971, + 5.411470999663036, + 4.309846252268695, + 7.134101195584642, + 8.398776718824767, + 5.6631570310967465, + 6.072559523278559, + 3.854811639654704, + 0.0, + 7.529184598969917, + 6.903282911791188, + 9.0, + 6.317531174829905, + 5.457753923371659, + 3.908281400328807, + 7.579428202635459, + 4.07661278488929, + 6.652724827169063, + 7.529184598969917, + 0.0, + 7.0, + 4.977014354725805, + 6.016362684141827, + 7.0, + 4.83431066343688, + 6.760811985364303, + 6.19678948003145, + 5.298236851430971, + 6.903282911791188, + 7.0, + 0.0}, + raft::distance::DistanceType::Canberra, 0.0}; +const InputConfiguration input_lp_unexpanded = { + 10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 1.31462855332296, + 1.3690307816129905, + 1.698603990921237, + 1.3460470789553531, + 1.6636670712582544, + 1.2651744044972217, + 1.1938329352055201, + 1.8811409082590185, + 1.3653115050624267, + 1.31462855332296, + 0.0, + 1.9447722703291133, + 1.42818777206562, + 1.4685491458946494, + 1.3071999866010466, + 1.4988622861692171, + 0.9698559287406783, + 1.4972023224597841, + 1.5243383567266802, + 1.3690307816129905, + 1.9447722703291133, + 0.0, + 1.2748400840107568, + 1.0599569946448246, + 1.546591282841402, + 1.147526531928459, + 1.447002179128145, + 1.5982242387673176, + 1.3112533607072414, + 1.698603990921237, + 1.42818777206562, + 1.2748400840107568, + 0.0, + 1.038121552545461, + 1.011788365364402, + 1.3907391109256988, + 1.3128200942311496, + 1.19595706584447, + 1.3233328139624725, + 1.3460470789553531, + 1.4685491458946494, + 1.0599569946448246, + 1.038121552545461, + 0.0, + 1.3642741698145529, + 1.3493868683808095, + 1.394942694628328, + 1.572881849642552, + 1.380122665319464, + 1.6636670712582544, + 1.3071999866010466, + 1.546591282841402, + 1.011788365364402, + 1.3642741698145529, + 0.0, + 1.018961640373018, + 1.0114394258945634, + 0.8338711034820684, + 1.1247823842299223, + 1.2651744044972217, + 1.4988622861692171, + 1.147526531928459, + 1.3907391109256988, + 1.3493868683808095, + 1.018961640373018, + 0.0, + 0.7701238110357329, + 1.245486437864406, + 0.5551259549534626, + 1.1938329352055201, + 0.9698559287406783, + 1.447002179128145, + 1.3128200942311496, + 1.394942694628328, + 1.0114394258945634, + 0.7701238110357329, + 0.0, + 1.1886800117391216, + 1.0083692448135637, + 1.8811409082590185, + 1.4972023224597841, + 1.5982242387673176, + 1.19595706584447, + 1.572881849642552, + 0.8338711034820684, + 1.245486437864406, + 1.1886800117391216, + 0.0, + 1.3661374102525012, + 1.3653115050624267, + 1.5243383567266802, + 1.3112533607072414, + 1.3233328139624725, + 1.380122665319464, + 1.1247823842299223, + 0.5551259549534626, + 1.0083692448135637, + 1.3661374102525012, + 0.0}, + raft::distance::DistanceType::LpUnexpanded, + 2.0}; + +const InputConfiguration input_linf = { + 10, + {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0.0, + 0.9251771844789913, + 0.9036452083899731, + 0.9251771844789913, + 0.8706483735804971, + 0.9251771844789913, + 0.717493881903289, + 0.6920214832303888, + 0.9251771844789913, + 0.9251771844789913, + 0.9251771844789913, + 0.0, + 0.9036452083899731, + 0.8655339692155823, + 0.8706483735804971, + 0.8655339692155823, + 0.8655339692155823, + 0.6329837991017668, + 0.8655339692155823, + 0.8655339692155823, + 0.9036452083899731, + 0.9036452083899731, + 0.0, + 0.7988276152181608, + 0.7028075145996631, + 0.9036452083899731, + 0.9036452083899731, + 0.9036452083899731, + 0.8429599432532096, + 0.9036452083899731, + 0.9251771844789913, + 0.8655339692155823, + 0.7988276152181608, + 0.0, + 0.48376552205293305, + 0.8206394616536681, + 0.8206394616536681, + 0.8206394616536681, + 0.8429599432532096, + 0.8206394616536681, + 0.8706483735804971, + 0.8706483735804971, + 0.7028075145996631, + 0.48376552205293305, + 0.0, + 0.8706483735804971, + 0.8706483735804971, + 0.8706483735804971, + 0.8429599432532096, + 0.8706483735804971, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.0, + 0.8853924473642432, + 0.535821510936138, + 0.6497196601457607, + 0.8853924473642432, + 0.717493881903289, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.0, + 0.5279604218147174, + 0.6658348373853169, + 0.33799874888632914, + 0.6920214832303888, + 0.6329837991017668, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.535821510936138, + 0.5279604218147174, + 0.0, + 0.662579808115858, + 0.5079750812968089, + 0.9251771844789913, + 0.8655339692155823, + 0.8429599432532096, + 0.8429599432532096, + 0.8429599432532096, + 0.6497196601457607, + 0.6658348373853169, + 0.662579808115858, + 0.0, + 0.8429599432532096, + 0.9251771844789913, + 0.8655339692155823, + 0.9036452083899731, + 0.8206394616536681, + 0.8706483735804971, + 0.8853924473642432, + 0.33799874888632914, + 0.5079750812968089, + 0.8429599432532096, + 0.0}, + raft::distance::DistanceType::Linf, + 0.0}; + +const InputConfiguration input_l1 = {4, + {0, 1, 1, 2, 4}, + {3, 2, 0, 1}, // indices + {0.99296, 0.42180, 0.11687, 0.305869}, + { + // dense output + 0.0, + 0.99296, + 1.41476, + 1.415707, + 0.99296, + 0.0, + 0.42180, + 0.42274, + 1.41476, + 0.42180, + 0.0, + 0.84454, + 1.41570, + 0.42274, + 0.84454, + 0.0, + }, + raft::distance::DistanceType::L1, + 0.0}; + // test dense smem strategy -const std::vector< - SparseDistanceCOOSPMVInputs> - inputs_dense_strategy = {{input_inner_product}, {input_l2_unexpanded}, - {input_canberra}, {input_lp_unexpanded}, - {input_linf}, {input_l1}}; +const std::vector> + inputs_dense_strategy = {{input_inner_product}, + {input_l2_unexpanded}, + {input_canberra}, + {input_lp_unexpanded}, + {input_linf}, + {input_l1}}; typedef SparseDistanceCOOSPMVTest SparseDistanceCOOSPMVTestDenseStrategyF; @@ -660,22 +666,22 @@ INSTANTIATE_TEST_CASE_P(SparseDistanceCOOSPMVTests, ::testing::ValuesIn(inputs_dense_strategy)); // test hash and chunk strategy -const std::vector> - inputs_hash_strategy = {{input_inner_product}, - {input_inner_product, 0.5, 2}, - {input_l2_unexpanded}, - {input_l2_unexpanded, 0.5, 2}, - {input_canberra}, - {input_canberra, 0.5, 2}, - {input_canberra, 0.5, 6}, - {input_lp_unexpanded}, - {input_lp_unexpanded, 0.5, 2}, - {input_lp_unexpanded, 0.5, 6}, - {input_linf}, - {input_linf, 0.5, 2}, - {input_linf, 0.5, 6}, - {input_l1}, - {input_l1, 0.5, 2}}; +const std::vector> inputs_hash_strategy = { + {input_inner_product}, + {input_inner_product, 0.5, 2}, + {input_l2_unexpanded}, + {input_l2_unexpanded, 0.5, 2}, + {input_canberra}, + {input_canberra, 0.5, 2}, + {input_canberra, 0.5, 6}, + {input_lp_unexpanded}, + {input_lp_unexpanded, 0.5, 2}, + {input_lp_unexpanded, 0.5, 6}, + {input_linf}, + {input_linf, 0.5, 2}, + {input_linf, 0.5, 6}, + {input_l1}, + {input_l1, 0.5, 2}}; typedef SparseDistanceCOOSPMVTest SparseDistanceCOOSPMVTestHashStrategyF; diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu index 3bc562bb68..8538c9cf39 100644 --- a/cpp/test/sparse/distance.cu +++ b/cpp/test/sparse/distance.cu @@ -49,8 +49,8 @@ struct SparseDistanceInputs { }; template -::std::ostream &operator<<( - ::std::ostream &os, const SparseDistanceInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SparseDistanceInputs& dims) +{ return os; } @@ -59,52 +59,56 @@ class SparseDistanceTest : public ::testing::TestWithParam> { public: SparseDistanceTest() - : params(::testing::TestWithParam< - SparseDistanceInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), dist_config(handle), indptr(0, handle.get_stream()), indices(0, handle.get_stream()), data(0, handle.get_stream()), out_dists(0, handle.get_stream()), - out_dists_ref(0, handle.get_stream()) {} + out_dists_ref(0, handle.get_stream()) + { + } - void SetUp() override { + void SetUp() override + { make_data(); - dist_config.b_nrows = params.indptr_h.size() - 1; - dist_config.b_ncols = params.n_cols; - dist_config.b_nnz = params.indices_h.size(); - dist_config.b_indptr = indptr.data(); + dist_config.b_nrows = params.indptr_h.size() - 1; + dist_config.b_ncols = params.n_cols; + dist_config.b_nnz = params.indices_h.size(); + dist_config.b_indptr = indptr.data(); dist_config.b_indices = indices.data(); - dist_config.b_data = data.data(); - dist_config.a_nrows = params.indptr_h.size() - 1; - dist_config.a_ncols = params.n_cols; - dist_config.a_nnz = params.indices_h.size(); - dist_config.a_indptr = indptr.data(); + dist_config.b_data = data.data(); + dist_config.a_nrows = params.indptr_h.size() - 1; + dist_config.a_ncols = params.n_cols; + dist_config.a_nnz = params.indices_h.size(); + dist_config.a_indptr = indptr.data(); dist_config.a_indices = indices.data(); - dist_config.a_data = data.data(); + dist_config.a_data = data.data(); int out_size = dist_config.a_nrows * dist_config.b_nrows; out_dists.resize(out_size, handle.get_stream()); - pairwiseDistance(out_dists.data(), dist_config, params.metric, - params.metric_arg); + pairwiseDistance(out_dists.data(), dist_config, params.metric, params.metric_arg); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void compare() { - ASSERT_TRUE(devArrMatch(out_dists_ref.data(), out_dists.data(), + void compare() + { + ASSERT_TRUE(devArrMatch(out_dists_ref.data(), + out_dists.data(), params.out_dists_ref_h.size(), CompareApprox(1e-3))); } protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; auto stream = handle.get_stream(); indptr.resize(indptr_h.size(), stream); @@ -119,8 +123,10 @@ class SparseDistanceTest out_dists_ref.resize((indptr_h.size() - 1) * (indptr_h.size() - 1), stream); - update_device(out_dists_ref.data(), out_dists_ref_h.data(), - out_dists_ref_h.size(), dist_config.handle.get_stream()); + update_device(out_dists_ref.data(), + out_dists_ref_h.data(), + out_dists_ref_h.size(), + dist_config.handle.get_stream()); } raft::handle_t handle; @@ -182,8 +188,7 @@ const std::vector> inputs_i32_f = { {0, 2, 4, 6, 8}, {0, 1, 0, 1, 0, 1, 0, 1}, {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}, - {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, - 5.0}, + {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0}, raft::distance::DistanceType::InnerProduct, 0.0}, {2, @@ -214,40 +219,33 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, - {0., 0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, - 0.58146987, 0.44940102, 1., 0.76978799, 0.39419924, 0., - 0.97577154, 0.48904013, 0.48300801, 0.45087445, 0.73323749, 0.21050481, - 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0., 0.51413997, - 0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819, 1., - 0.79593037, 0.48904013, 0.51413997, 0., 0.28605559, 0.35772784, - 1., 0.60889396, 0.43324829, 0.84923694, 0.45658883, 0.48300801, - 0.31195441, 0.28605559, 0., 0.58623212, 0.6745457, 0.60287165, - 0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, - 0.58623212, 0., 0.77917274, 0.48390993, 0.24558392, 0.99166225, - 0.58146987, 0.73323749, 0.67534399, 1., 0.6745457, 0.77917274, - 0., 0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481, - 0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0., - 0.51360432, 0.68185144, 1., 0.54847744, 0.8321819, 0.43324829, - 0.67676228, 0.24558392, 0.76064776, 0.51360432, 0., 1., - 0.76978799, 0.78021386, 1., 0.84923694, 0.73155632, 0.99166225, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0., 0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, 0.58146987, 0.44940102, + 1., 0.76978799, 0.39419924, 0., 0.97577154, 0.48904013, 0.48300801, 0.45087445, + 0.73323749, 0.21050481, 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0., 0.51413997, + 0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819, 1., 0.79593037, 0.48904013, + 0.51413997, 0., 0.28605559, 0.35772784, 1., 0.60889396, 0.43324829, 0.84923694, + 0.45658883, 0.48300801, 0.31195441, 0.28605559, 0., 0.58623212, 0.6745457, 0.60287165, + 0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, 0.58623212, 0., + 0.77917274, 0.48390993, 0.24558392, 0.99166225, 0.58146987, 0.73323749, 0.67534399, 1., + 0.6745457, 0.77917274, 0., 0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481, + 0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0., 0.51360432, 0.68185144, + 1., 0.54847744, 0.8321819, 0.43324829, 0.67676228, 0.24558392, 0.76064776, 0.51360432, + 0., 1., 0.76978799, 0.78021386, 1., 0.84923694, 0.73155632, 0.99166225, 0.61547536, 0.68185144, 1., 0.}, raft::distance::DistanceType::CosineExpanded, 0.0}, {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, @@ -356,15 +354,13 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, {0.0, 3.3954660629919076, 5.6469232737388815, @@ -470,15 +466,13 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, {0.0, 1.31462855332296, 1.3690307816129905, @@ -584,15 +578,13 @@ const std::vector> inputs_i32_f = { {10, {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50}, - {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, - 3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5, - 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices - {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, - 0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, - 0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, - 0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, - 0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, - 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, + {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4, + 6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9}, // indices + {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167, + 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625, + 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853, + 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228, 0.5279, 0.4885, 0.3495, 0.5079, + 0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218}, {0.0, 0.9251771844789913, 0.9036452083899731, @@ -698,17 +690,14 @@ const std::vector> inputs_i32_f = { {15, {0, 5, 8, 9, 15, 20, 26, 31, 34, 38, 45}, - {0, 1, 5, 6, 9, 1, 4, 14, 7, 3, 4, 7, 9, 11, 14, - 0, 3, 7, 8, 12, 0, 2, 5, 7, 8, 14, 4, 9, 10, 11, - 13, 4, 10, 14, 5, 6, 8, 9, 0, 2, 3, 4, 6, 10, 11}, - {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, - 0.73789274, 0.08450219, 1., 0.20184723, 0.18036963, 0.12581403, - 0.13867603, 0.24040536, 0.11288773, 0.00290246, 0.09120187, 0.31190555, - 0.43245423, 0.16153588, 0.3233026, 0.05279589, 0.1387149, 0.05962761, - 0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, - 0.15605804, 0.3867739, 0.24908977, 0.36413632, 0.37643732, 0.28910679, - 0.0198409, 0.31461499, 0.24412279, 0.08327667, 0.04444576, 0.05047969, - 0.26190054, 0.2077349, 0.10803964}, + {0, 1, 5, 6, 9, 1, 4, 14, 7, 3, 4, 7, 9, 11, 14, 0, 3, 7, 8, 12, 0, 2, 5, + 7, 8, 14, 4, 9, 10, 11, 13, 4, 10, 14, 5, 6, 8, 9, 0, 2, 3, 4, 6, 10, 11}, + {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, 0.73789274, 0.08450219, + 1., 0.20184723, 0.18036963, 0.12581403, 0.13867603, 0.24040536, 0.11288773, 0.00290246, + 0.09120187, 0.31190555, 0.43245423, 0.16153588, 0.3233026, 0.05279589, 0.1387149, 0.05962761, + 0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, 0.15605804, 0.3867739, + 0.24908977, 0.36413632, 0.37643732, 0.28910679, 0.0198409, 0.31461499, 0.24412279, 0.08327667, + 0.04444576, 0.05047969, 0.26190054, 0.2077349, 0.10803964}, {1.05367121e-08, 8.35309089e-01, 1.00000000e+00, 9.24116813e-01, 9.90039274e-01, 7.97613546e-01, 8.91271059e-01, 1.00000000e+00, 6.64669302e-01, 8.59439512e-01, 8.35309089e-01, 1.05367121e-08, @@ -767,31 +756,25 @@ const std::vector> inputs_i32_f = { {0, 3, 8, 12, 16, 20, 25, 30, 35, 40, 45}, {0, 3, 4, 0, 1, 2, 3, 4, 1, 2, 3, 4, 0, 2, 3, 4, 0, 1, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4}, - {0.70862347, 0.8232774, 0.12108795, 0.84527547, 0.94937088, 0.03258545, - 0.99584118, 0.76835667, 0.34426657, 0.2357925, 0.01274851, 0.11422017, - 0.3437756, 0.31967718, 0.5956055, 0.31610373, 0.04147273, 0.03724415, - 0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529, - 0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329, - 0.61364678, 0.22837736, 0.56609561, 0.29809423, 0.76736686, 0.56460608, - 0.98165371, 0.02140123, 0.19881268, 0.26057815, 0.31648823, 0.89874295, - 0.27366735, 0.5119944, 0.11416134}, + {0.70862347, 0.8232774, 0.12108795, 0.84527547, 0.94937088, 0.03258545, 0.99584118, 0.76835667, + 0.34426657, 0.2357925, 0.01274851, 0.11422017, 0.3437756, 0.31967718, 0.5956055, 0.31610373, + 0.04147273, 0.03724415, 0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529, + 0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329, 0.61364678, 0.22837736, + 0.56609561, 0.29809423, 0.76736686, 0.56460608, 0.98165371, 0.02140123, 0.19881268, 0.26057815, + 0.31648823, 0.89874295, 0.27366735, 0.5119944, 0.11416134}, {// dense output - 0., 0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794, - 0.76962708, 1.122858, 1.1232498, 1.08166081, 0.48769777, 0., - 1.31332116, 0.98318907, 0.42661815, 0.09279052, 1.35187836, 1.38429055, - 0.40658897, 0.56136388, 1.88014197, 1.31332116, 0., 1.82943642, - 1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848, - 0.26127048, 0.98318907, 1.82943642, 0., 0.29945563, 1.08494093, - 0.22934281, 0.82801925, 1.74288748, 1.50610116, 0.26657011, 0.42661815, - 1.54826077, 0.29945563, 0., 0.45060069, 0.77814948, 1.45245711, - 1.18328348, 0.82486987, 0.7874794, 0.09279052, 1.05918884, 1.08494093, - 0.45060069, 0., 1.29899154, 1.40683824, 0.48505269, 0.53862363, - 0.76962708, 1.35187836, 1.59360067, 0.22934281, 0.77814948, 1.29899154, - 0., 0.33202426, 1.92108999, 1.88812175, 1.122858, 1.38429055, - 1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0., - 1.47318624, 1.92660889, 1.1232498, 0.40658897, 0.60215168, 1.74288748, - 1.18328348, 0.48505269, 1.92108999, 1.47318624, 0., 0.24992619, - 1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363, + 0., 0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794, 0.76962708, 1.122858, + 1.1232498, 1.08166081, 0.48769777, 0., 1.31332116, 0.98318907, 0.42661815, 0.09279052, + 1.35187836, 1.38429055, 0.40658897, 0.56136388, 1.88014197, 1.31332116, 0., 1.82943642, + 1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848, 0.26127048, 0.98318907, + 1.82943642, 0., 0.29945563, 1.08494093, 0.22934281, 0.82801925, 1.74288748, 1.50610116, + 0.26657011, 0.42661815, 1.54826077, 0.29945563, 0., 0.45060069, 0.77814948, 1.45245711, + 1.18328348, 0.82486987, 0.7874794, 0.09279052, 1.05918884, 1.08494093, 0.45060069, 0., + 1.29899154, 1.40683824, 0.48505269, 0.53862363, 0.76962708, 1.35187836, 1.59360067, 0.22934281, + 0.77814948, 1.29899154, 0., 0.33202426, 1.92108999, 1.88812175, 1.122858, 1.38429055, + 1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0., 1.47318624, 1.92660889, + 1.1232498, 0.40658897, 0.60215168, 1.74288748, 1.18328348, 0.48505269, 1.92108999, 1.47318624, + 0., 0.24992619, 1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363, 1.88812175, 1.92660889, 0.24992619, 0.}, raft::distance::DistanceType::CorrelationExpanded, 0.0}, @@ -800,12 +783,11 @@ const std::vector> inputs_i32_f = { {1, 4, 0, 4, 1, 3, 0, 1, 3, 0}, {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, {// dense output - 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 0.8, 1., 1., 1., 1., - 1., 1., 1., 1., 0.8, 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., - 1., 1., 1., 1., 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 0.8, 1., 1., 1., - 1., 1., 1., 1., 0., 1., 0.8, 1., 1., 1., 1., 0.8, 1., 1., 1., 0., 1., - 1., 0.8, 0.8, 1., 1., 1., 0.8, 0.8, 1., 0., 1., 1., 1., 1., 1., 1., 1., - 1., 1., 1., 0., 1., 1., 1., 0.8, 1., 1., 1., 0.8, 1., 1., 0.}, + 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 0.8, 1., 1., 1., 1., 1., 1., 1., + 1., 0.8, 0., 1., 1., 1., 0.8, 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., + 0.8, 1., 1., 1., 0., 1., 1., 0.8, 1., 1., 1., 1., 1., 1., 1., 0., 1., 0.8, 1., 1., + 1., 1., 0.8, 1., 1., 1., 0., 1., 1., 0.8, 0.8, 1., 1., 1., 0.8, 0.8, 1., 0., 1., 1., + 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.8, 1., 1., 1., 0.8, 1., 1., 0.}, raft::distance::DistanceType::RusselRaoExpanded, 0.0}, {5, @@ -813,13 +795,12 @@ const std::vector> inputs_i32_f = { {0, 3, 4, 4, 2, 3, 0, 2, 3, 2}, {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}, {// dense output - 0., 0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, - 0., 0.4, 0.6, 0.2, 0., 0.6, 0.4, 0., 0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4, - 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., 0.4, 0.2, 0.2, 0.2, 0., - 0.2, 0.6, 0.8, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., - 0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0., 0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8, - 0.6, 0.2, 0., 0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4, 0.2, 0.2, 0.4, 0., 0.2, - 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0.}, + 0., 0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, + 0.6, 0.2, 0., 0.6, 0.4, 0., 0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4, 0.2, 0., 0.4, 0., + 0.2, 0., 0.4, 0.6, 0.2, 0., 0.4, 0.2, 0.2, 0.2, 0., 0.2, 0.6, 0.8, 0.4, 0.2, 0.2, + 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0., 0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0., 0.2, + 0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8, 0.6, 0.2, 0., 0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4, + 0.2, 0.2, 0.4, 0., 0.2, 0.2, 0., 0.4, 0., 0.2, 0., 0.4, 0.6, 0.2, 0.}, raft::distance::DistanceType::HammingUnexpanded, 0.0}, {3, @@ -863,7 +844,8 @@ const std::vector> inputs_i32_f = { typedef SparseDistanceTest SparseDistanceTestF; TEST_P(SparseDistanceTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(SparseDistanceTests, SparseDistanceTestF, +INSTANTIATE_TEST_CASE_P(SparseDistanceTests, + SparseDistanceTestF, ::testing::ValuesIn(inputs_i32_f)); }; // namespace distance diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu index 58ad9cf803..63245a63b0 100644 --- a/cpp/test/sparse/filter.cu +++ b/cpp/test/sparse/filter.cu @@ -35,8 +35,7 @@ struct SparseFilterInputs { }; template -class SparseFilterTests - : public ::testing::TestWithParam> { +class SparseFilterTests : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -49,12 +48,13 @@ class SparseFilterTests const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseFilterTests COORemoveZeros; -TEST_P(COORemoveZeros, Result) { +TEST_P(COORemoveZeros, Result) +{ cudaStream_t stream; cudaStreamCreate(&stream); params = ::testing::TestWithParam>::GetParam(); - float *in_h_vals = new float[params.nnz]; + float* in_h_vals = new float[params.nnz]; COO in(stream, params.nnz, 5, 5); @@ -67,8 +67,8 @@ TEST_P(COORemoveZeros, Result) { in_h_vals[2] = 0; in_h_vals[3] = 0; - int *in_h_rows = new int[params.nnz]; - int *in_h_cols = new int[params.nnz]; + int* in_h_rows = new int[params.nnz]; + int* in_h_cols = new int[params.nnz]; for (int i = 0; i < params.nnz; i++) { in_h_rows[i] = params.nnz - i - 1; @@ -84,9 +84,9 @@ TEST_P(COORemoveZeros, Result) { int out_rows_ref_h[2] = {0, 3}; int out_cols_ref_h[2] = {4, 1}; - float *out_vals_ref_h = (float *)malloc(2 * sizeof(float)); - out_vals_ref_h[0] = in_h_vals[4]; - out_vals_ref_h[1] = in_h_vals[1]; + float* out_vals_ref_h = (float*)malloc(2 * sizeof(float)); + out_vals_ref_h[0] = in_h_vals[4]; + out_vals_ref_h[1] = in_h_vals[1]; COO out_ref(stream, 2, 5, 5); COO out(stream); @@ -97,12 +97,9 @@ TEST_P(COORemoveZeros, Result) { op::coo_remove_zeros<32, float>(&in, &out, stream); - ASSERT_TRUE(raft::devArrMatch(out_ref.rows(), out.rows(), 2, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out_ref.cols(), out.cols(), 2, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out_ref.vals(), out.vals(), 2, - raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out_ref.rows(), out.rows(), 2, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out_ref.cols(), out.cols(), 2, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out_ref.vals(), out.vals(), 2, raft::Compare())); CUDA_CHECK(cudaStreamDestroy(stream)); free(out_vals_ref_h); @@ -112,8 +109,7 @@ TEST_P(COORemoveZeros, Result) { delete[] in_h_vals; } -INSTANTIATE_TEST_CASE_P(SparseFilterTests, COORemoveZeros, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(SparseFilterTests, COORemoveZeros, ::testing::ValuesIn(inputsf)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu index 86b3b3d382..a693262193 100644 --- a/cpp/test/sparse/knn.cu +++ b/cpp/test/sparse/knn.cu @@ -48,60 +48,76 @@ struct SparseKNNInputs { int batch_size_index = 2; int batch_size_query = 2; - raft::distance::DistanceType metric = - raft::distance::DistanceType::L2SqrtExpanded; + raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded; }; template -::std::ostream &operator<<(::std::ostream &os, - const SparseKNNInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SparseKNNInputs& dims) +{ return os; } template -class SparseKNNTest - : public ::testing::TestWithParam> { +class SparseKNNTest : public ::testing::TestWithParam> { public: SparseKNNTest() - : params(::testing::TestWithParam< - SparseKNNInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), indptr(0, handle.get_stream()), indices(0, handle.get_stream()), data(0, handle.get_stream()), out_indices(0, handle.get_stream()), out_dists(0, handle.get_stream()), out_indices_ref(0, handle.get_stream()), - out_dists_ref(0, handle.get_stream()) {} + out_dists_ref(0, handle.get_stream()) + { + } protected: - void SetUp() override { + void SetUp() override + { n_rows = params.indptr_h.size() - 1; - nnz = params.indices_h.size(); - k = params.k; + nnz = params.indices_h.size(); + k = params.k; make_data(); - raft::sparse::selection::brute_force_knn( - indptr.data(), indices.data(), data.data(), nnz, n_rows, params.n_cols, - indptr.data(), indices.data(), data.data(), nnz, n_rows, params.n_cols, - out_indices.data(), out_dists.data(), k, handle, params.batch_size_index, - params.batch_size_query, params.metric); + raft::sparse::selection::brute_force_knn(indptr.data(), + indices.data(), + data.data(), + nnz, + n_rows, + params.n_cols, + indptr.data(), + indices.data(), + data.data(), + nnz, + n_rows, + params.n_cols, + out_indices.data(), + out_dists.data(), + k, + handle, + params.batch_size_index, + params.batch_size_query, + params.metric); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void compare() { - ASSERT_TRUE(devArrMatch(out_dists_ref.data(), out_dists.data(), n_rows * k, - CompareApprox(1e-4))); - ASSERT_TRUE(devArrMatch(out_indices_ref.data(), out_indices.data(), - n_rows * k, Compare())); + void compare() + { + ASSERT_TRUE(devArrMatch( + out_dists_ref.data(), out_dists.data(), n_rows * k, CompareApprox(1e-4))); + ASSERT_TRUE( + devArrMatch(out_indices_ref.data(), out_indices.data(), n_rows * k, Compare())); } protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; auto stream = handle.get_stream(); indptr.resize(indptr_h.size(), stream); @@ -112,16 +128,15 @@ class SparseKNNTest update_device(indices.data(), indices_h.data(), indices_h.size(), stream); update_device(data.data(), data_h.data(), data_h.size(), stream); - std::vector out_dists_ref_h = params.out_dists_ref_h; + std::vector out_dists_ref_h = params.out_dists_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; out_indices_ref.resize(out_indices_ref_h.size(), stream); out_dists_ref.resize(out_dists_ref_h.size(), stream); - update_device(out_indices_ref.data(), out_indices_ref_h.data(), - out_indices_ref_h.size(), stream); - update_device(out_dists_ref.data(), out_dists_ref_h.data(), - out_dists_ref_h.size(), stream); + update_device( + out_indices_ref.data(), out_indices_ref_h.data(), out_indices_ref_h.size(), stream); + update_device(out_dists_ref.data(), out_dists_ref_h.data(), out_dists_ref_h.size(), stream); out_dists.resize(n_rows * k, stream); out_indices.resize(n_rows * k, stream); @@ -158,8 +173,7 @@ const std::vector> inputs_i32_f = { raft::distance::DistanceType::L2SqrtExpanded}}; typedef SparseKNNTest SparseKNNTestF; TEST_P(SparseKNNTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, - ::testing::ValuesIn(inputs_i32_f)); +INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace selection }; // end namespace sparse diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu index c2a1c4b93c..1ed017f40a 100644 --- a/cpp/test/sparse/knn_graph.cu +++ b/cpp/test/sparse/knn_graph.cu @@ -30,8 +30,9 @@ namespace raft { namespace sparse { template -__global__ void assert_symmetry(value_idx *rows, value_idx *cols, value_t *vals, - value_idx nnz, value_idx *sum) { +__global__ void assert_symmetry( + value_idx* rows, value_idx* cols, value_t* vals, value_idx nnz, value_idx* sum) +{ int tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; @@ -51,32 +52,31 @@ struct KNNGraphInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const KNNGraphInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const KNNGraphInputs& dims) +{ return os; } template -class KNNGraphTest - : public ::testing::TestWithParam> { +class KNNGraphTest : public ::testing::TestWithParam> { public: KNNGraphTest() - : params(::testing::TestWithParam< - KNNGraphInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), - X(0, stream) { + X(0, stream) + { X.resize(params.X.size(), stream); } protected: - void SetUp() override { + void SetUp() override + { out = new raft::sparse::COO(stream); update_device(X.data(), params.X.data(), params.X.size(), stream); raft::sparse::selection::knn_graph( - handle, X.data(), params.m, params.n, - raft::distance::DistanceType::L2Unexpanded, *out); + handle, X.data(), params.m, params.n, raft::distance::DistanceType::L2Unexpanded, *out); rmm::device_scalar sum(stream); sum.set_value_to_zero_async(stream); @@ -98,7 +98,7 @@ class KNNGraphTest cudaStream_t stream; // input data - raft::sparse::COO *out; + raft::sparse::COO* out; rmm::device_uvector X; @@ -112,13 +112,15 @@ const std::vector> knn_graph_inputs_fint = { {4, 2, {0, 100, 0.01, 0.02, 5000, 10000, -5, -2}, 2}}; typedef KNNGraphTest KNNGraphTestF_int; -TEST_P(KNNGraphTestF_int, Result) { +TEST_P(KNNGraphTestF_int, Result) +{ // nnz should not be larger than twice m * k ASSERT_TRUE(out->nnz <= (params.m * params.k * 2)); ASSERT_TRUE(sum_h == 0); } -INSTANTIATE_TEST_CASE_P(KNNGraphTest, KNNGraphTestF_int, +INSTANTIATE_TEST_CASE_P(KNNGraphTest, + KNNGraphTestF_int, ::testing::ValuesIn(knn_graph_inputs_fint)); } // namespace sparse diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index 6d4af7f016..50401e5b7a 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -55,45 +55,44 @@ struct LinkageInputs { * @param b: number of pairs of points that both the clusters have classified differently */ template -__global__ void computeTheNumerator(const T* firstClusterArray, - const T* secondClusterArray, uint64_t size, - uint64_t* a, uint64_t* b) { - //calculating the indices of pairs of datapoints compared by the current thread +__global__ void computeTheNumerator( + const T* firstClusterArray, const T* secondClusterArray, uint64_t size, uint64_t* a, uint64_t* b) +{ + // calculating the indices of pairs of datapoints compared by the current thread uint64_t j = threadIdx.x + blockIdx.x * blockDim.x; uint64_t i = threadIdx.y + blockIdx.y * blockDim.y; - //thread-local variables to count a and b + // thread-local variables to count a and b uint64_t myA = 0, myB = 0; if (i < size && j < size && j < i) { - //checking if the pair have been classified the same by both the clusters + // checking if the pair have been classified the same by both the clusters if (firstClusterArray[i] == firstClusterArray[j] && secondClusterArray[i] == secondClusterArray[j]) { ++myA; } - //checking if the pair have been classified differently by both the clusters + // checking if the pair have been classified differently by both the clusters else if (firstClusterArray[i] != firstClusterArray[j] && secondClusterArray[i] != secondClusterArray[j]) { ++myB; } } - //specialize blockReduce for a 2D block of 1024 threads of type uint64_t - typedef cub::BlockReduce + // specialize blockReduce for a 2D block of 1024 threads of type uint64_t + typedef cub::BlockReduce BlockReduce; - //Allocate shared memory for blockReduce + // Allocate shared memory for blockReduce __shared__ typename BlockReduce::TempStorage temp_storage; - //summing up thread-local counts specific to a block + // summing up thread-local counts specific to a block myA = BlockReduce(temp_storage).Sum(myA); __syncthreads(); myB = BlockReduce(temp_storage).Sum(myB); __syncthreads(); - //executed once per block + // executed once per block if (threadIdx.x == 0 && threadIdx.y == 0) { raft::myAtomicAdd((unsigned long long int*)a, myA); raft::myAtomicAdd((unsigned long long int*)b, myB); @@ -101,53 +100,54 @@ __global__ void computeTheNumerator(const T* firstClusterArray, } /** -* @brief Function to calculate RandIndex -* more info on rand index -* @param firstClusterArray: the array of classes of type T -* @param secondClusterArray: the array of classes of type T -* @param size: the size of the data points of type uint64_t -* @param stream: the cudaStream object -*/ + * @brief Function to calculate RandIndex + * more info on rand index + * @param firstClusterArray: the array of classes of type T + * @param secondClusterArray: the array of classes of type T + * @param size: the size of the data points of type uint64_t + * @param stream: the cudaStream object + */ template -double compute_rand_index(T* firstClusterArray, T* secondClusterArray, - uint64_t size, cudaStream_t stream) { - //rand index for size less than 2 is not defined +double compute_rand_index(T* firstClusterArray, + T* secondClusterArray, + uint64_t size, + cudaStream_t stream) +{ + // rand index for size less than 2 is not defined ASSERT(size >= 2, "Rand Index for size less than 2 not defined!"); - //allocating and initializing memory for a and b in the GPU + // allocating and initializing memory for a and b in the GPU rmm::device_uvector arr_buf(2, stream); CUDA_CHECK(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream)); - //kernel configuration + // kernel configuration static const int BLOCK_DIM_Y = 16, BLOCK_DIM_X = 16; dim3 numThreadsPerBlock(BLOCK_DIM_X, BLOCK_DIM_Y); dim3 numBlocks(raft::ceildiv(size, numThreadsPerBlock.x), raft::ceildiv(size, numThreadsPerBlock.y)); - //calling the kernel - computeTheNumerator - <<>>( - firstClusterArray, secondClusterArray, size, arr_buf.data(), - arr_buf.data() + 1); + // calling the kernel + computeTheNumerator<<>>( + firstClusterArray, secondClusterArray, size, arr_buf.data(), arr_buf.data() + 1); - //synchronizing and updating the calculated values of a and b from device to host + // synchronizing and updating the calculated values of a and b from device to host uint64_t ab_host[2] = {0}; raft::update_host(ab_host, arr_buf.data(), 2, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); - //error handling + // error handling CUDA_CHECK(cudaGetLastError()); - //denominator + // denominator uint64_t nChooseTwo = size * (size - 1) / 2; - //calculating the rand_index + // calculating the rand_index return (double)(((double)(ab_host[0] + ab_host[1])) / (double)nChooseTwo); } template -::std::ostream& operator<<(::std::ostream& os, - const LinkageInputs& dims) { +::std::ostream& operator<<(::std::ostream& os, const LinkageInputs& dims) +{ return os; } @@ -158,15 +158,17 @@ class LinkageTest : public ::testing::TestWithParam> { : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), labels(params.n_row, stream), - labels_ref(params.n_row, stream) {} + labels_ref(params.n_row, stream) + { + } protected: - void basicTest() { + void basicTest() + { rmm::device_uvector data(params.n_row * params.n_col, stream); raft::copy(data.data(), params.data.data(), data.size(), stream); - raft::copy(labels_ref.data(), params.expected_labels.data(), params.n_row, - stream); + raft::copy(labels_ref.data(), params.expected_labels.data(), params.n_row, stream); raft::hierarchy::linkage_output out_arrs; out_arrs.labels = labels.data(); @@ -176,16 +178,19 @@ class LinkageTest : public ::testing::TestWithParam> { out_arrs.children = out_children.data(); raft::handle_t handle; - raft::hierarchy::single_linkage< - IdxT, T, raft::hierarchy::LinkageDistance::KNN_GRAPH>( - handle, data.data(), params.n_row, params.n_col, - raft::distance::DistanceType::L2SqrtExpanded, &out_arrs, params.c, + raft::hierarchy::single_linkage( + handle, + data.data(), + params.n_row, + params.n_col, + raft::distance::DistanceType::L2SqrtExpanded, + &out_arrs, + params.c, params.n_clusters); CUDA_CHECK(cudaStreamSynchronize(stream)); - score = compute_rand_index(labels.data(), labels_ref.data(), params.n_row, - stream); + score = compute_rand_index(labels.data(), labels_ref.data(), params.n_row, stream); } void SetUp() override { basicTest(); } @@ -203,14 +208,12 @@ const std::vector> linkage_inputsf2 = { // Test n_clusters == n_points {10, 5, - {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, - 0.77782677, 0.43772379, 0.4035871, 0.3282796, 0.47544681, 0.59862974, - 0.12319357, 0.06239463, 0.28200272, 0.1345717, 0.50498218, 0.5113505, - 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, - 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, - 0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792, - 0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692, - 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, + {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379, + 0.4035871, 0.3282796, 0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717, + 0.50498218, 0.5113505, 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, + 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, 0.84854131, 0.28890216, + 0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554, + 0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, 0.76166195, 0.66613745}, {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, 10, @@ -218,8 +221,7 @@ const std::vector> linkage_inputsf2 = { // // Test outlier points {9, 2, - {-1, -50, 3, 4, 5000, 10000, 1, 3, 4, 5, 0.000005, 0.00002, 2000000, 500000, - 10, 50, 30, 5}, + {-1, -50, 3, 4, 5000, 10000, 1, 3, 4, 5, 0.000005, 0.00002, 2000000, 500000, 10, 50, 30, 5}, {6, 0, 5, 0, 0, 4, 3, 2, 1}, 7, -1}, @@ -227,14 +229,12 @@ const std::vector> linkage_inputsf2 = { // Test n_clusters == (n_points / 2) {10, 5, - {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, - 0.77782677, 0.43772379, 0.4035871, 0.3282796, 0.47544681, 0.59862974, - 0.12319357, 0.06239463, 0.28200272, 0.1345717, 0.50498218, 0.5113505, - 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, - 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, - 0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792, - 0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692, - 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, + {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379, + 0.4035871, 0.3282796, 0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717, + 0.50498218, 0.5113505, 0.16233086, 0.62165332, 0.42281548, 0.933117, 0.41386077, 0.23264562, + 0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674, 0.84854131, 0.28890216, + 0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554, + 0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396, 0.76166195, 0.66613745}, {1, 0, 4, 0, 0, 3, 2, 0, 2, 1}, 5, @@ -243,340 +243,173 @@ const std::vector> linkage_inputsf2 = { // Test n_points == 100 {100, 10, - {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, - 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, - 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, - 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, - 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, - 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, - 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, - 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, - 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, - 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, - 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, - 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, - 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, - 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, - 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, - 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, - 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, - 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, - 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, - 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, - 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, - 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, - 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, - 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, - 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, - 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, - 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, - 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, - 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, - 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, - 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, - 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, - 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, - 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, - 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, - 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, - 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, - 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, - 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, - 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, - 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, - 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, - 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, - 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, - 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, - 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, - 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, - 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, - 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, - 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, - 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, - 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, - 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, - 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, - 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, - 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, - 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, - 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, - 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, - 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, - 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, - 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, - 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, - 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, - 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, - 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, - 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, - 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, - 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, - 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, - 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, - 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, - 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, - 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, - 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, - 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, - 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, - 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, - 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, - 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, - 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, - 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, - 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, - 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, - 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, - 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, - 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, - 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, - 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, - 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, - 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, - 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, - 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, - 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, - 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, - 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, - 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, - 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, - 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, - 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, - 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, - 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, - 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, - 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, - 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, - 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, - 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, - 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, - 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, - 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, - 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, - 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, - 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, - 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, - 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, - 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, - 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, - 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, - 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, - 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, - 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, - 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, - 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, - 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, - 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, - 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, - 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, - 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, - 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, - 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, - 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, - 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, - 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, - 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, - 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, - 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, - 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, - 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, - 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, - 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, - 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, - 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, - 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, - 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, - 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, - 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, - 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, - 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, - 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, - 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, - 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, - 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, - 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, - 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, - 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, - 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, - 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, - 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, - 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, - 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, - 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, - 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, - 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, - 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, - 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, - 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, - 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, - 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, - 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, - 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, - 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, - 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, - 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, - 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, - 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, - 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, - 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, - 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, - 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, - 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, - 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, - 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, - 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, - 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, - 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, - 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, - 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, - 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, - 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, - 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, - 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, - 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, - 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, - 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, - 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, - 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, - 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, - 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, - 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, - 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, - 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, - 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, - 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, - 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, - 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, - 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, - 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, - 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, - 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, - 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, - 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, - 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, - 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, - 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, - 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, - 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, - 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, - 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, - 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, - 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, - 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, - 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, - 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, - 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, - 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, - 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, - 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, - 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, - 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, - 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, - 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, - 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, - 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, - 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, - 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, - 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, - 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, - 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, - 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, - 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, - 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, - 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, - 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, - 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, - 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, - 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, - 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, - 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, - 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, - 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, - 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, - 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, - 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, - 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, - 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, - 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, - 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, - 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, - 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, - 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, - 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, - 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, - 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, - 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, - 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, - 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, - 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, - 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, - 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, - 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, - 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, - 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, - 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, - 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, - 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, - 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, - 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, - 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, - 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, - 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, - 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, - 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, - 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, - 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, - 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, - 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, - 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, - 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, - 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, - 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, - 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, - 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, - 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, - 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, - 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, - 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, - 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, - 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, - 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, - 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, - 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, - 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, - 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, - 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, - 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, - 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, - 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, - 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, - 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, - 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, - 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, - 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, - 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, - 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, - 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, - 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, - 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, - 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, - 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, - 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, - 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, - 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, - 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, - 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, - 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, - 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, - 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, - 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, - 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, - 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, - 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, - 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, - 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, - 8.66342445e-01 + {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01, + 6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01, + 9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01, + 7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01, + 3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01, + 9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01, + 7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01, + 4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01, + 3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01, + 9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02, + 8.82216988e-01, 8.31498633e-01, 7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01, + 5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01, + 8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01, + 6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01, + 1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01, + 5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01, + 9.84156240e-02, 2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01, + 6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01, + 9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02, + 1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01, + 8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01, + 2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01, + 1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01, + 5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02, + 4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02, + 6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01, + 9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01, + 7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01, + 7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01, + 5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01, + 1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 2.52811151e-01, 1.44792843e-01, + 6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01, + 3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01, + 3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01, + 4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01, + 2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01, + 7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 2.88637806e-01, 3.44696803e-01, 6.78171906e-01, + 5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01, + 7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02, + 2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01, + 7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01, + 1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01, + 9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02, + 2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01, + 4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01, + 5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01, + 6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01, + 4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01, + 5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01, + 9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01, + 1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01, + 9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01, + 3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01, + 2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01, + 1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01, + 2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01, + 2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 7.15417683e-01, + 8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01, + 9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01, + 9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01, + 9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01, + 8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01, + 4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01, + 1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01, + 3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01, + 5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01, + 1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01, + 8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01, + 1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02, + 6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02, + 8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01, + 5.40732486e-01, 3.69704071e-01, 5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01, + 3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01, + 1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01, + 2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01, + 6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01, + 6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01, + 6.62634841e-01, 8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01, + 3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01, + 1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01, + 9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01, + 9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01, + 3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01, + 1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01, + 9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01, + 9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01, + 2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01, + 3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01, + 3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01, + 5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01, + 6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03, + 3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 6.71103888e-02, 7.41503703e-01, + 1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01, + 2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01, + 4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01, + 1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01, + 8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01, + 8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 6.38105527e-01, 3.50638261e-02, 5.22217353e-02, + 9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01, + 6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01, + 7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02, + 8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02, + 5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01, + 7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01, + 1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01, + 8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01, + 1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01, + 3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01, + 9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01, + 2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02, + 6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01, + 5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01, + 2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01, + 7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01, + 4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01, + 9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01, + 2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01, + 5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 7.91878033e-01, + 4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01, + 4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01, + 8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01, + 7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03, + 4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02, + 1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01, + 2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01, + 9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01, + 1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01, + 3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01, + 3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01, + 7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01, + 8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01, + 5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01, + 8.99035326e-01, 7.20847756e-01, 5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01, + 4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01, + 7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01, + 4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01, + 7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01, + 1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01, + 6.55689206e-01, 9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01, + 9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02, + 1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02, + 8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01, + 9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01, + 4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01, + 7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01, + 1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01, + 2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01, + 7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01, + 7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01, + 3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01, + 7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01, + 2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01, + 2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 7.42827585e-01, 2.05760059e-01, + 9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01, + 4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01, + 4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01, + 5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01, + 3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01, + 9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 2.37463702e-01, 2.25893784e-01, 2.46900245e-01, + 4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01, + 1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01, + 3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01, + 4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01, + 8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01, + 5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02, + 4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01, + 1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01 }, {0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -589,6 +422,5 @@ const std::vector> linkage_inputsf2 = { typedef LinkageTest LinkageTestF_Int; TEST_P(LinkageTestF_Int, Result) { EXPECT_TRUE(score == 1.0); } -INSTANTIATE_TEST_CASE_P(LinkageTest, LinkageTestF_Int, - ::testing::ValuesIn(linkage_inputsf2)); +INSTANTIATE_TEST_CASE_P(LinkageTest, LinkageTestF_Int, ::testing::ValuesIn(linkage_inputsf2)); } // end namespace raft diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu index 4900b3ff2b..3cf465e032 100644 --- a/cpp/test/sparse/norm.cu +++ b/cpp/test/sparse/norm.cu @@ -39,24 +39,25 @@ struct CSRRowNormalizeInputs { }; template -class CSRRowNormalizeTest - : public ::testing::TestWithParam> { +class CSRRowNormalizeTest : public ::testing::TestWithParam> { public: CSRRowNormalizeTest() - : params(::testing::TestWithParam< - CSRRowNormalizeInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), in_vals(params.in_vals.size(), stream), verify(params.verify.size(), stream), ex_scan(params.ex_scan.size(), stream), - result(params.verify.size(), stream) {} + result(params.verify.size(), stream) + { + } protected: void SetUp() override {} - void Run() { + void Run() + { Index_ n_rows = params.ex_scan.size(); - Index_ nnz = params.in_vals.size(); + Index_ nnz = params.in_vals.size(); raft::update_device(ex_scan.data(), params.ex_scan.data(), n_rows, stream); raft::update_device(in_vals.data(), params.in_vals.data(), nnz, stream); @@ -73,8 +74,8 @@ class CSRRowNormalizeTest break; } - ASSERT_TRUE(raft::devArrMatch(verify.data(), result.data(), nnz, - raft::Compare())); + ASSERT_TRUE( + raft::devArrMatch(verify.data(), result.data(), nnz, raft::Compare())); } protected: @@ -113,9 +114,11 @@ const std::vector> csrnormalize_inputs_d = { {0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 1, 0.0}}, }; -INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestF, +INSTANTIATE_TEST_CASE_P(SparseNormTest, + CSRRowNormalizeTestF, ::testing::ValuesIn(csrnormalize_inputs_f)); -INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestD, +INSTANTIATE_TEST_CASE_P(SparseNormTest, + CSRRowNormalizeTestD, ::testing::ValuesIn(csrnormalize_inputs_d)); } // namespace sparse diff --git a/cpp/test/sparse/reduce.cu b/cpp/test/sparse/reduce.cu index 8ff4a600bc..9a27ae5134 100644 --- a/cpp/test/sparse/reduce.cu +++ b/cpp/test/sparse/reduce.cu @@ -42,15 +42,15 @@ struct SparseReduceInputs { }; template -class SparseReduceTest - : public ::testing::TestWithParam> { +class SparseReduceTest : public ::testing::TestWithParam> { protected: - void SetUp() override { - params = ::testing::TestWithParam< - SparseReduceInputs>::GetParam(); + void SetUp() override + { + params = ::testing::TestWithParam>::GetParam(); } - void Run() { + void Run() + { raft::handle_t handle; auto stream = handle.get_stream(); @@ -62,30 +62,29 @@ class SparseReduceTest rmm::device_uvector out_cols(params.out_cols.size(), stream); rmm::device_uvector out_vals(params.out_vals.size(), stream); - raft::update_device(in_rows.data(), params.in_rows.data(), - params.in_rows.size(), stream); - raft::update_device(in_cols.data(), params.in_cols.data(), - params.in_cols.size(), stream); - raft::update_device(in_vals.data(), params.in_vals.data(), - params.in_vals.size(), stream); - raft::update_device(out_rows.data(), params.out_rows.data(), - params.out_rows.size(), stream); - raft::update_device(out_cols.data(), params.out_cols.data(), - params.out_cols.size(), stream); - raft::update_device(out_vals.data(), params.out_vals.data(), - params.out_vals.size(), stream); + raft::update_device(in_rows.data(), params.in_rows.data(), params.in_rows.size(), stream); + raft::update_device(in_cols.data(), params.in_cols.data(), params.in_cols.size(), stream); + raft::update_device(in_vals.data(), params.in_vals.data(), params.in_vals.size(), stream); + raft::update_device(out_rows.data(), params.out_rows.data(), params.out_rows.size(), stream); + raft::update_device(out_cols.data(), params.out_cols.data(), params.out_cols.size(), stream); + raft::update_device(out_vals.data(), params.out_vals.data(), params.out_vals.size(), stream); raft::sparse::COO out(stream); - raft::sparse::op::max_duplicates(handle, out, in_rows.data(), - in_cols.data(), in_vals.data(), - params.in_rows.size(), params.m, params.n); + raft::sparse::op::max_duplicates(handle, + out, + in_rows.data(), + in_cols.data(), + in_vals.data(), + params.in_rows.size(), + params.m, + params.n); ASSERT_TRUE(raft::devArrMatch( out_rows.data(), out.rows(), out.nnz, raft::Compare())); ASSERT_TRUE(raft::devArrMatch( out_cols.data(), out.cols(), out.nnz, raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out_vals.data(), out.vals(), out.nnz, - raft::Compare())); + ASSERT_TRUE( + raft::devArrMatch(out_vals.data(), out.vals(), out.nnz, raft::Compare())); } void TearDown() override {} @@ -114,7 +113,8 @@ const std::vector> max_reduce_inputs_f = { 4}, }; -INSTANTIATE_TEST_CASE_P(SparseReduceTest, SparseReduceTestF, +INSTANTIATE_TEST_CASE_P(SparseReduceTest, + SparseReduceTestF, ::testing::ValuesIn(max_reduce_inputs_f)); } // namespace sparse diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu index d527e7323e..d73288b9f6 100644 --- a/cpp/test/sparse/row_op.cu +++ b/cpp/test/sparse/row_op.cu @@ -38,43 +38,48 @@ struct CSRRowOpInputs { /** Wrapper to call csr_row_op because the enclosing function of a __device__ * lambda cannot have private ot protected access within the class. */ template -void csr_row_op_wrapper(const Index_ *row_ind, Index_ n_rows, Index_ nnz, - Type_f *result, cudaStream_t stream) { +void csr_row_op_wrapper( + const Index_* row_ind, Index_ n_rows, Index_ nnz, Type_f* result, cudaStream_t stream) +{ op::csr_row_op( - row_ind, n_rows, nnz, + row_ind, + n_rows, + nnz, [result] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) { - for (Index_ i = start_idx; i < stop_idx; i++) result[i] = row; + for (Index_ i = start_idx; i < stop_idx; i++) + result[i] = row; }, stream); } template -class CSRRowOpTest - : public ::testing::TestWithParam> { +class CSRRowOpTest : public ::testing::TestWithParam> { public: CSRRowOpTest() - : params( - ::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), verify(params.verify.size(), stream), ex_scan(params.ex_scan.size(), stream), - result(params.verify.size(), stream) {} + result(params.verify.size(), stream) + { + } protected: - void SetUp() override { + void SetUp() override + { n_rows = params.ex_scan.size(); - nnz = params.verify.size(); + nnz = params.verify.size(); } - void Run() { + void Run() + { raft::update_device(ex_scan.data(), params.ex_scan.data(), n_rows, stream); raft::update_device(verify.data(), params.verify.data(), nnz, stream); - csr_row_op_wrapper(ex_scan.data(), n_rows, nnz, - result.data(), stream); + csr_row_op_wrapper(ex_scan.data(), n_rows, nnz, result.data(), stream); - ASSERT_TRUE(raft::devArrMatch(verify.data(), result.data(), nnz, - raft::Compare())); + ASSERT_TRUE( + raft::devArrMatch(verify.data(), result.data(), nnz, raft::Compare())); } protected: @@ -100,10 +105,8 @@ const std::vector> csrrowop_inputs_d = { {{0, 4, 8, 9}, {0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0}}, }; -INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestF, - ::testing::ValuesIn(csrrowop_inputs_f)); -INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestD, - ::testing::ValuesIn(csrrowop_inputs_d)); +INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestF, ::testing::ValuesIn(csrrowop_inputs_f)); +INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestD, ::testing::ValuesIn(csrrowop_inputs_d)); } // namespace sparse } // namespace raft diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu index 7d43780cfd..c7cd03b485 100644 --- a/cpp/test/sparse/sort.cu +++ b/cpp/test/sparse/sort.cu @@ -46,7 +46,8 @@ class SparseSortTest : public ::testing::TestWithParam> { const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseSortTest COOSort; -TEST_P(COOSort, Result) { +TEST_P(COOSort, Result) +{ params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); cudaStream_t stream; @@ -59,13 +60,13 @@ TEST_P(COOSort, Result) { r.uniform(in_vals.data(), params.nnz, float(-1.0), float(1.0), stream); - int *in_rows_h = (int *)malloc(params.nnz * sizeof(int)); - int *in_cols_h = (int *)malloc(params.nnz * sizeof(int)); - int *verify_h = (int *)malloc(params.nnz * sizeof(int)); + int* in_rows_h = (int*)malloc(params.nnz * sizeof(int)); + int* in_cols_h = (int*)malloc(params.nnz * sizeof(int)); + int* verify_h = (int*)malloc(params.nnz * sizeof(int)); for (int i = 0; i < params.nnz; i++) { in_rows_h[i] = params.nnz - i - 1; - verify_h[i] = i; + verify_h[i] = i; in_cols_h[i] = i; } @@ -74,11 +75,11 @@ TEST_P(COOSort, Result) { raft::update_device(in_cols.data(), in_cols_h, params.nnz, stream); raft::update_device(verify.data(), verify_h, params.nnz, stream); - op::coo_sort(params.m, params.n, params.nnz, in_rows.data(), in_cols.data(), - in_vals.data(), stream); + op::coo_sort( + params.m, params.n, params.nnz, in_rows.data(), in_cols.data(), in_vals.data(), stream); - ASSERT_TRUE(raft::devArrMatch(verify.data(), in_rows.data(), params.nnz, - raft::Compare())); + ASSERT_TRUE( + raft::devArrMatch(verify.data(), in_rows.data(), params.nnz, raft::Compare())); delete[] in_rows_h; delete[] in_cols_h; diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu index 77d9d3d822..53bea0ddc0 100644 --- a/cpp/test/sparse/symmetrize.cu +++ b/cpp/test/sparse/symmetrize.cu @@ -31,8 +31,9 @@ namespace raft { namespace sparse { template -__global__ void assert_symmetry(value_idx *rows, value_idx *cols, value_t *vals, - value_idx nnz, value_idx *sum) { +__global__ void assert_symmetry( + value_idx* rows, value_idx* cols, value_t* vals, value_idx nnz, value_idx* sum) +{ int tid = blockDim.x * blockIdx.x + threadIdx.x; if (tid >= nnz) return; @@ -51,28 +52,31 @@ struct SparseSymmetrizeInputs { }; template -::std::ostream &operator<<( - ::std::ostream &os, const SparseSymmetrizeInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, + const SparseSymmetrizeInputs& dims) +{ return os; } template -class SparseSymmetrizeTest : public ::testing::TestWithParam< - SparseSymmetrizeInputs> { +class SparseSymmetrizeTest + : public ::testing::TestWithParam> { public: SparseSymmetrizeTest() - : params(::testing::TestWithParam< - SparseSymmetrizeInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), indptr(0, stream), indices(0, stream), - data(0, stream) {} + data(0, stream) + { + } protected: - void make_data() { - std::vector indptr_h = params.indptr_h; + void make_data() + { + std::vector indptr_h = params.indptr_h; std::vector indices_h = params.indices_h; - std::vector data_h = params.data_h; + std::vector data_h = params.data_h; indptr.resize(indptr_h.size(), stream); indices.resize(indices_h.size(), stream); @@ -83,22 +87,22 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< update_device(data.data(), data_h.data(), data_h.size(), stream); } - void SetUp() override { + void SetUp() override + { make_data(); - value_idx m = params.indptr_h.size() - 1; - value_idx n = params.n_cols; + value_idx m = params.indptr_h.size() - 1; + value_idx n = params.n_cols; value_idx nnz = params.indices_h.size(); rmm::device_uvector coo_rows(nnz, stream); - raft::sparse::convert::csr_to_coo(indptr.data(), m, coo_rows.data(), nnz, - stream); + raft::sparse::convert::csr_to_coo(indptr.data(), m, coo_rows.data(), nnz, stream); raft::sparse::COO out(stream); - raft::sparse::linalg::symmetrize(handle, coo_rows.data(), indices.data(), - data.data(), m, n, coo_rows.size(), out); + raft::sparse::linalg::symmetrize( + handle, coo_rows.data(), indices.data(), data.data(), m, n, coo_rows.size(), out); rmm::device_scalar sum(stream); sum.set_value_to_zero_async(stream); @@ -130,8 +134,7 @@ struct COOSymmetrizeInputs { }; template -class COOSymmetrizeTest - : public ::testing::TestWithParam> { +class COOSymmetrizeTest : public ::testing::TestWithParam> { protected: void SetUp() override {} @@ -141,22 +144,21 @@ class COOSymmetrizeTest const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef COOSymmetrizeTest COOSymmetrize; -TEST_P(COOSymmetrize, Result) { +TEST_P(COOSymmetrize, Result) +{ cudaStream_t stream; cudaStreamCreate(&stream); int nnz = 8; - int *in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; - int *in_cols_h = new int[nnz]{1, 3, 2, 3, 0, 1, 0, 2}; - float *in_vals_h = new float[nnz]{0.5, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5}; + int* in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; + int* in_cols_h = new int[nnz]{1, 3, 2, 3, 0, 1, 0, 2}; + float* in_vals_h = new float[nnz]{0.5, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5}; - int *exp_rows_h = - new int[nnz * 2]{1, 0, 0, 0, 1, 3, 1, 0, 0, 2, 2, 0, 3, 2, 3, 0}; - int *exp_cols_h = - new int[nnz * 2]{0, 1, 3, 0, 2, 1, 3, 0, 2, 0, 1, 0, 0, 3, 2, 0}; - float *exp_vals_h = new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0, - 0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0}; + int* exp_rows_h = new int[nnz * 2]{1, 0, 0, 0, 1, 3, 1, 0, 0, 2, 2, 0, 3, 2, 3, 0}; + int* exp_cols_h = new int[nnz * 2]{0, 1, 3, 0, 2, 1, 3, 0, 2, 0, 1, 0, 0, 3, 2, 0}; + float* exp_vals_h = + new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0, 0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0}; COO in(stream, nnz, 4, 4); raft::update_device(in.rows(), *&in_rows_h, nnz, stream); @@ -166,22 +168,18 @@ TEST_P(COOSymmetrize, Result) { COO out(stream); linalg::coo_symmetrize<32, float>( - &in, &out, - [] __device__(int row, int col, float val, float trans) { - return val + trans; - }, + &in, + &out, + [] __device__(int row, int col, float val, float trans) { return val + trans; }, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); std::cout << out << std::endl; ASSERT_TRUE(out.nnz == nnz * 2); - ASSERT_TRUE(raft::devArrMatch(out.rows(), exp_rows_h, out.nnz, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out.cols(), exp_cols_h, out.nnz, - raft::Compare())); - ASSERT_TRUE(raft::devArrMatch(out.vals(), exp_vals_h, out.nnz, - raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.rows(), exp_rows_h, out.nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.cols(), exp_cols_h, out.nnz, raft::Compare())); + ASSERT_TRUE(raft::devArrMatch(out.vals(), exp_vals_h, out.nnz, raft::Compare())); cudaStreamDestroy(stream); @@ -194,8 +192,7 @@ TEST_P(COOSymmetrize, Result) { delete[] exp_vals_h; } -INSTANTIATE_TEST_CASE_P(COOSymmetrizeTest, COOSymmetrize, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_CASE_P(COOSymmetrizeTest, COOSymmetrize, ::testing::ValuesIn(inputsf)); const std::vector> symm_inputs_fint = { // Test n_clusters == n_points @@ -215,7 +212,8 @@ const std::vector> symm_inputs_fint = { typedef SparseSymmetrizeTest SparseSymmetrizeTestF_int; TEST_P(SparseSymmetrizeTestF_int, Result) { ASSERT_TRUE(sum_h == 0); } -INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest, SparseSymmetrizeTestF_int, +INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest, + SparseSymmetrizeTestF_int, ::testing::ValuesIn(symm_inputs_fint)); } // namespace sparse diff --git a/cpp/test/spatial/ball_cover.cu b/cpp/test/spatial/ball_cover.cu index ca30506df0..ab85e7fe8f 100644 --- a/cpp/test/spatial/ball_cover.cu +++ b/cpp/test/spatial/ball_cover.cu @@ -37,21 +37,26 @@ namespace knn { using namespace std; template -__global__ void count_discrepancies_kernel(value_idx *actual_idx, - value_idx *expected_idx, - value_t *actual, value_t *expected, - uint32_t m, uint32_t n, - uint32_t *out, float thres = 1e-3) { +__global__ void count_discrepancies_kernel(value_idx* actual_idx, + value_idx* expected_idx, + value_t* actual, + value_t* expected, + uint32_t m, + uint32_t n, + uint32_t* out, + float thres = 1e-3) +{ uint32_t row = blockDim.x * blockIdx.x + threadIdx.x; int n_diffs = 0; if (row < m) { for (uint32_t i = 0; i < n; i++) { - value_t d = actual[row * n + i] - expected[row * n + i]; + value_t d = actual[row * n + i] - expected[row * n + i]; bool matches = fabsf(d) <= thres; if (!matches) { // printf("row=%d, actual_idx=%ld, actual=%f, expected_id=%ld, expected=%f\n", - // row, actual_idx[row*n+i], actual[row*n+i], expected_idx[row*n+i], expected[row*n+i]); + // row, actual_idx[row*n+i], actual[row*n+i], expected_idx[row*n+i], + // expected[row*n+i]); } n_diffs += !matches; @@ -61,13 +66,19 @@ __global__ void count_discrepancies_kernel(value_idx *actual_idx, } struct is_nonzero { - __host__ __device__ bool operator()(uint32_t &i) { return i > 0; } + __host__ __device__ bool operator()(uint32_t& i) { return i > 0; } }; template -uint32_t count_discrepancies(value_idx *actual_idx, value_idx *expected_idx, - value_t *actual, value_t *expected, uint32_t m, - uint32_t n, uint32_t *out, cudaStream_t stream) { +uint32_t count_discrepancies(value_idx* actual_idx, + value_idx* expected_idx, + value_t* actual, + value_t* expected, + uint32_t m, + uint32_t n, + uint32_t* out, + cudaStream_t stream) +{ uint32_t tpb = 256; count_discrepancies_kernel<<>>( actual_idx, expected_idx, actual, expected, m, n, out); @@ -79,25 +90,41 @@ uint32_t count_discrepancies(value_idx *actual_idx, value_idx *expected_idx, } template -void compute_bfknn(const raft::handle_t &handle, const value_t *X1, - const value_t *X2, uint32_t n, uint32_t d, uint32_t k, - const raft::distance::DistanceType metric, value_t *dists, - int64_t *inds) { - std::vector input_vec = {const_cast(X1)}; +void compute_bfknn(const raft::handle_t& handle, + const value_t* X1, + const value_t* X2, + uint32_t n, + uint32_t d, + uint32_t k, + const raft::distance::DistanceType metric, + value_t* dists, + int64_t* inds) +{ + std::vector input_vec = {const_cast(X1)}; std::vector sizes_vec = {n}; - cudaStream_t *int_streams = nullptr; - std::vector *translations = nullptr; - - raft::spatial::knn::detail::brute_force_knn_impl( - input_vec, sizes_vec, d, const_cast(X2), n, inds, dists, k, - handle.get_stream(), int_streams, 0, true, true, translations, metric); + cudaStream_t* int_streams = nullptr; + std::vector* translations = nullptr; + + raft::spatial::knn::detail::brute_force_knn_impl(input_vec, + sizes_vec, + d, + const_cast(X2), + n, + inds, + dists, + k, + handle.get_stream(), + int_streams, + 0, + true, + true, + translations, + metric); } struct ToRadians { - __device__ __host__ float operator()(float a) { - return a * (CUDART_PI_F / 180.0); - } + __device__ __host__ float operator()(float a) { return a * (CUDART_PI_F / 180.0); } }; struct BallCoverInputs { @@ -109,13 +136,14 @@ struct BallCoverInputs { template class BallCoverKNNQueryTest : public ::testing::TestWithParam { protected: - void basicTest() { + void basicTest() + { params = ::testing::TestWithParam::GetParam(); raft::handle_t handle; - uint32_t k = params.k; + uint32_t k = params.k; float weight = params.weight; - auto metric = params.metric; + auto metric = params.metric; std::vector h_train_inputs = spatial_data; @@ -126,17 +154,25 @@ class BallCoverKNNQueryTest : public ::testing::TestWithParam { // Allocate input rmm::device_uvector d_train_inputs(n * d, handle.get_stream()); - raft::update_device(d_train_inputs.data(), h_train_inputs.data(), n * d, - handle.get_stream()); + raft::update_device(d_train_inputs.data(), h_train_inputs.data(), n * d, handle.get_stream()); if (metric == raft::distance::DistanceType::Haversine) { - thrust::transform(handle.get_thrust_policy(), d_train_inputs.data(), + thrust::transform(handle.get_thrust_policy(), + d_train_inputs.data(), d_train_inputs.data() + d_train_inputs.size(), - d_train_inputs.data(), ToRadians()); + d_train_inputs.data(), + ToRadians()); } - compute_bfknn(handle, d_train_inputs.data(), d_train_inputs.data(), n, d, k, - metric, d_ref_D.data(), d_ref_I.data()); + compute_bfknn(handle, + d_train_inputs.data(), + d_train_inputs.data(), + n, + d, + k, + metric, + d_ref_D.data(), + d_ref_I.data()); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); @@ -144,13 +180,11 @@ class BallCoverKNNQueryTest : public ::testing::TestWithParam { rmm::device_uvector d_pred_I(n * k, handle.get_stream()); rmm::device_uvector d_pred_D(n * k, handle.get_stream()); - BallCoverIndex index(handle, d_train_inputs.data(), n, - d, metric); + BallCoverIndex index(handle, d_train_inputs.data(), n, d, metric); raft::spatial::knn::rbc_build_index(handle, index); - raft::spatial::knn::rbc_knn_query(handle, index, k, d_train_inputs.data(), - n, d_pred_I.data(), d_pred_D.data(), true, - weight); + raft::spatial::knn::rbc_knn_query( + handle, index, k, d_train_inputs.data(), n, d_pred_I.data(), d_pred_D.data(), true, weight); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); // What we really want are for the distances to match exactly. The @@ -158,12 +192,19 @@ class BallCoverKNNQueryTest : public ::testing::TestWithParam { // can be nondeterministic. rmm::device_uvector discrepancies(n, handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), discrepancies.data(), - discrepancies.data() + discrepancies.size(), 0); + thrust::fill(handle.get_thrust_policy(), + discrepancies.data(), + discrepancies.data() + discrepancies.size(), + 0); // - int res = count_discrepancies(d_ref_I.data(), d_pred_I.data(), - d_ref_D.data(), d_pred_D.data(), n, k, - discrepancies.data(), handle.get_stream()); + int res = count_discrepancies(d_ref_I.data(), + d_pred_I.data(), + d_ref_D.data(), + d_pred_D.data(), + n, + k, + discrepancies.data(), + handle.get_stream()); ASSERT_TRUE(res == 0); } @@ -180,13 +221,14 @@ class BallCoverKNNQueryTest : public ::testing::TestWithParam { template class BallCoverAllKNNTest : public ::testing::TestWithParam { protected: - void basicTest() { + void basicTest() + { params = ::testing::TestWithParam::GetParam(); raft::handle_t handle; - uint32_t k = params.k; + uint32_t k = params.k; float weight = params.weight; - auto metric = params.metric; + auto metric = params.metric; std::vector h_train_inputs = spatial_data; @@ -197,25 +239,37 @@ class BallCoverAllKNNTest : public ::testing::TestWithParam { // Allocate input rmm::device_uvector d_train_inputs(n * d, handle.get_stream()); - raft::update_device(d_train_inputs.data(), h_train_inputs.data(), n * d, - handle.get_stream()); + raft::update_device(d_train_inputs.data(), h_train_inputs.data(), n * d, handle.get_stream()); if (metric == raft::distance::DistanceType::Haversine) { - thrust::transform(handle.get_thrust_policy(), d_train_inputs.data(), + thrust::transform(handle.get_thrust_policy(), + d_train_inputs.data(), d_train_inputs.data() + d_train_inputs.size(), - d_train_inputs.data(), ToRadians()); + d_train_inputs.data(), + ToRadians()); } - cudaStream_t *int_streams = nullptr; - std::vector *translations = nullptr; + cudaStream_t* int_streams = nullptr; + std::vector* translations = nullptr; - std::vector input_vec = {d_train_inputs.data()}; + std::vector input_vec = {d_train_inputs.data()}; std::vector sizes_vec = {n}; - raft::spatial::knn::detail::brute_force_knn_impl( - input_vec, sizes_vec, d, d_train_inputs.data(), n, d_ref_I.data(), - d_ref_D.data(), k, handle.get_stream(), int_streams, 0, true, true, - translations, metric); + raft::spatial::knn::detail::brute_force_knn_impl(input_vec, + sizes_vec, + d, + d_train_inputs.data(), + n, + d_ref_I.data(), + d_ref_D.data(), + k, + handle.get_stream(), + int_streams, + 0, + true, + true, + translations, + metric); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); @@ -223,11 +277,10 @@ class BallCoverAllKNNTest : public ::testing::TestWithParam { rmm::device_uvector d_pred_I(n * k, handle.get_stream()); rmm::device_uvector d_pred_D(n * k, handle.get_stream()); - BallCoverIndex index(handle, d_train_inputs.data(), n, - d, metric); + BallCoverIndex index(handle, d_train_inputs.data(), n, d, metric); - raft::spatial::knn::rbc_all_knn_query(handle, index, k, d_pred_I.data(), - d_pred_D.data(), true, weight); + raft::spatial::knn::rbc_all_knn_query( + handle, index, k, d_pred_I.data(), d_pred_D.data(), true, weight); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); // What we really want are for the distances to match exactly. The @@ -235,12 +288,19 @@ class BallCoverAllKNNTest : public ::testing::TestWithParam { // can be nondeterministic. rmm::device_uvector discrepancies(n, handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), discrepancies.data(), - discrepancies.data() + discrepancies.size(), 0); + thrust::fill(handle.get_thrust_policy(), + discrepancies.data(), + discrepancies.data() + discrepancies.size(), + 0); // - uint32_t res = count_discrepancies( - d_ref_I.data(), d_pred_I.data(), d_ref_D.data(), d_pred_D.data(), n, k, - discrepancies.data(), handle.get_stream()); + uint32_t res = count_discrepancies(d_ref_I.data(), + d_pred_I.data(), + d_ref_D.data(), + d_pred_D.data(), + n, + k, + discrepancies.data(), + handle.get_stream()); ASSERT_TRUE(res == 0); } @@ -265,9 +325,11 @@ const std::vector ballcover_inputs = { {7, 1.0, raft::distance::DistanceType::L2SqrtUnexpanded}, }; -INSTANTIATE_TEST_CASE_P(BallCoverAllKNNTest, BallCoverAllKNNTestF, +INSTANTIATE_TEST_CASE_P(BallCoverAllKNNTest, + BallCoverAllKNNTestF, ::testing::ValuesIn(ballcover_inputs)); -INSTANTIATE_TEST_CASE_P(BallCoverKNNQueryTest, BallCoverKNNQueryTestF, +INSTANTIATE_TEST_CASE_P(BallCoverKNNQueryTest, + BallCoverKNNQueryTestF, ::testing::ValuesIn(ballcover_inputs)); TEST_P(BallCoverAllKNNTestF, Fit) { basicTest(); } diff --git a/cpp/test/spatial/fused_l2_knn.cu b/cpp/test/spatial/fused_l2_knn.cu index 4930b47e0c..e48a3c6657 100644 --- a/cpp/test/spatial/fused_l2_knn.cu +++ b/cpp/test/spatial/fused_l2_knn.cu @@ -49,20 +49,25 @@ struct idx_dist_pair { IdxT idx; DistT dist; compareDist eq_compare; - bool operator==(const idx_dist_pair &a) const { + bool operator==(const idx_dist_pair& a) const + { if (idx == a.idx) return true; if (eq_compare(dist, a.dist)) return true; return false; } - idx_dist_pair(IdxT x, DistT y, compareDist op) - : idx(x), dist(y), eq_compare(op) {} + idx_dist_pair(IdxT x, DistT y, compareDist op) : idx(x), dist(y), eq_compare(op) {} }; template -testing::AssertionResult devArrMatchKnnPair( - const T *expected_idx, const T *actual_idx, const DistT *expected_dist, - const DistT *actual_dist, size_t rows, size_t cols, const DistT eps, - cudaStream_t stream = 0) { +testing::AssertionResult devArrMatchKnnPair(const T* expected_idx, + const T* actual_idx, + const DistT* expected_dist, + const DistT* actual_dist, + size_t rows, + size_t cols, + const DistT eps, + cudaStream_t stream = 0) +{ size_t size = rows * cols; std::unique_ptr exp_idx_h(new T[size]); std::unique_ptr act_idx_h(new T[size]); @@ -75,9 +80,9 @@ testing::AssertionResult devArrMatchKnnPair( CUDA_CHECK(cudaStreamSynchronize(stream)); for (size_t i(0); i < rows; ++i) { for (size_t j(0); j < cols; ++j) { - auto idx = i * cols + j; // row major assumption! - auto exp_idx = exp_idx_h.get()[idx]; - auto act_idx = act_idx_h.get()[idx]; + auto idx = i * cols + j; // row major assumption! + auto exp_idx = exp_idx_h.get()[idx]; + auto act_idx = act_idx_h.get()[idx]; auto exp_dist = exp_dist_h.get()[idx]; auto act_dist = act_dist_h.get()[idx]; idx_dist_pair exp_kvp(exp_idx, exp_dist, raft::CompareApprox(eps)); @@ -85,8 +90,7 @@ testing::AssertionResult devArrMatchKnnPair( if (!(exp_kvp == act_kvp)) { return testing::AssertionFailure() << "actual=" << act_kvp.idx << "," << act_kvp.dist << "!=" - << "expected" << exp_kvp.idx << "," << exp_kvp.dist << " @" << i - << "," << j; + << "expected" << exp_kvp.idx << "," << exp_kvp.dist << " @" << i << "," << j; } } } @@ -96,26 +100,43 @@ testing::AssertionResult devArrMatchKnnPair( template class FusedL2KNNTest : public ::testing::TestWithParam { protected: - void testBruteForce() { + void testBruteForce() + { cudaStream_t stream = handle_.get_stream(); launchFaissBfknn(); - detail::fusedL2Knn(dim, raft_indices_, raft_distances_, database, - search_queries, num_db_vecs, num_queries, k_, true, true, - stream, metric); + detail::fusedL2Knn(dim, + raft_indices_, + raft_distances_, + database, + search_queries, + num_db_vecs, + num_queries, + k_, + true, + true, + stream, + metric); // verify. - devArrMatchKnnPair(faiss_indices_, raft_indices_, faiss_distances_, - raft_distances_, num_queries, k_, float(0.001), stream); + devArrMatchKnnPair(faiss_indices_, + raft_indices_, + faiss_distances_, + raft_distances_, + num_queries, + k_, + float(0.001), + stream); } - void SetUp() override { - params_ = ::testing::TestWithParam::GetParam(); + void SetUp() override + { + params_ = ::testing::TestWithParam::GetParam(); num_queries = params_.num_queries; num_db_vecs = params_.num_db_vecs; - dim = params_.dim; - k_ = params_.k; - metric = params_.metric_; + dim = params_.dim; + k_ = params_.k; + metric = params_.metric_; cudaStream_t stream = handle_.get_stream(); @@ -133,12 +154,14 @@ class FusedL2KNNTest : public ::testing::TestWithParam { raft::allocate(faiss_distances_, num_queries * k_, stream, true); } - void TearDown() override { + void TearDown() override + { cudaStream_t stream = handle_.get_stream(); raft::deallocate_all(stream); } - void launchFaissBfknn() { + void launchFaissBfknn() + { faiss::MetricType m = detail::build_faiss_metric(metric); faiss::gpu::StandardGpuResources gpu_res; @@ -149,18 +172,18 @@ class FusedL2KNNTest : public ::testing::TestWithParam { gpu_res.setDefaultStream(device, handle_.get_stream()); faiss::gpu::GpuDistanceParams args; - args.metric = m; - args.metricArg = 0; - args.k = k_; - args.dims = dim; - args.vectors = database; + args.metric = m; + args.metricArg = 0; + args.k = k_; + args.dims = dim; + args.vectors = database; args.vectorsRowMajor = true; - args.numVectors = num_db_vecs; - args.queries = search_queries; + args.numVectors = num_db_vecs; + args.queries = search_queries; args.queriesRowMajor = true; - args.numQueries = num_queries; - args.outDistances = faiss_distances_; - args.outIndices = faiss_indices_; + args.numQueries = num_queries; + args.outDistances = faiss_distances_; + args.outIndices = faiss_indices_; bfKnn(&gpu_res, args); } @@ -171,12 +194,12 @@ class FusedL2KNNTest : public ::testing::TestWithParam { int num_queries; int num_db_vecs; int dim; - T *database; - T *search_queries; - int64_t *raft_indices_; - T *raft_distances_; - int64_t *faiss_indices_; - T *faiss_distances_; + T* database; + T* search_queries; + int64_t* raft_indices_; + T* raft_distances_; + int64_t* faiss_indices_; + T* faiss_distances_; int k_; raft::distance::DistanceType metric; }; @@ -201,8 +224,7 @@ const std::vector inputs = { typedef FusedL2KNNTest FusedL2KNNTestF; TEST_P(FusedL2KNNTestF, FusedBruteForce) { this->testBruteForce(); } -INSTANTIATE_TEST_CASE_P(FusedL2KNNTest, FusedL2KNNTestF, - ::testing::ValuesIn(inputs)); +INSTANTIATE_TEST_CASE_P(FusedL2KNNTest, FusedL2KNNTestF, ::testing::ValuesIn(inputs)); } // namespace knn } // namespace spatial diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu index 5a45c45bff..bff7665f83 100644 --- a/cpp/test/spatial/haversine.cu +++ b/cpp/test/spatial/haversine.cu @@ -35,10 +35,13 @@ class HaversineKNNTest : public ::testing::Test { d_ref_I(0, stream), d_ref_D(0, stream), d_pred_I(0, stream), - d_pred_D(0, stream) {} + d_pred_D(0, stream) + { + } protected: - void basicTest() { + void basicTest() + { // Allocate input d_train_inputs.resize(n * d, stream); @@ -51,35 +54,45 @@ class HaversineKNNTest : public ::testing::Test { d_pred_D.resize(n * n, stream); // make testdata on host - std::vector h_train_inputs = { - 0.71113885, -1.29215058, 0.59613176, -2.08048115, - 0.74932804, -1.33634042, 0.51486728, -1.65962873, - 0.53154002, -1.47049808, 0.72891737, -1.54095137}; + std::vector h_train_inputs = {0.71113885, + -1.29215058, + 0.59613176, + -2.08048115, + 0.74932804, + -1.33634042, + 0.51486728, + -1.65962873, + 0.53154002, + -1.47049808, + 0.72891737, + -1.54095137}; h_train_inputs.resize(d_train_inputs.size()); - raft::update_device(d_train_inputs.data(), h_train_inputs.data(), - d_train_inputs.size(), stream); - - std::vector h_res_D = { - 0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595, - 0., 0.36575755, 0.44288665, 0.5170737, 0.59501296, 0.62925595, - 0., 0.05041587, 0.152463, 0.2426416, 0.34925285, 0.59501296, - 0., 0.16461092, 0.2345792, 0.34925285, 0.35749438, 0.36575755, - 0., 0.16461092, 0.20535265, 0.23048252, 0.2426416, 0.5170737, - 0., 0.152463, 0.18767063, 0.20535265, 0.2345792, 0.44288665}; + raft::update_device( + d_train_inputs.data(), h_train_inputs.data(), d_train_inputs.size(), stream); + + std::vector h_res_D = {0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595, + 0., 0.36575755, 0.44288665, 0.5170737, 0.59501296, 0.62925595, + 0., 0.05041587, 0.152463, 0.2426416, 0.34925285, 0.59501296, + 0., 0.16461092, 0.2345792, 0.34925285, 0.35749438, 0.36575755, + 0., 0.16461092, 0.20535265, 0.23048252, 0.2426416, 0.5170737, + 0., 0.152463, 0.18767063, 0.20535265, 0.2345792, 0.44288665}; h_res_D.resize(n * n); raft::update_device(d_ref_D.data(), h_res_D.data(), n * n, stream); - std::vector h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0, - 2, 0, 5, 4, 3, 1, 3, 4, 5, 2, 0, 1, - 4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1}; + std::vector h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0, 2, 0, 5, 4, 3, 1, + 3, 4, 5, 2, 0, 1, 4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1}; h_res_I.resize(n * n); - raft::update_device(d_ref_I.data(), h_res_I.data(), n * n, - stream); + raft::update_device(d_ref_I.data(), h_res_I.data(), n * n, stream); - raft::spatial::knn::detail::haversine_knn( - d_pred_I.data(), d_pred_D.data(), d_train_inputs.data(), - d_train_inputs.data(), n, n, k, stream); + raft::spatial::knn::detail::haversine_knn(d_pred_I.data(), + d_pred_D.data(), + d_train_inputs.data(), + d_train_inputs.data(), + n, + n, + k, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -106,11 +119,11 @@ class HaversineKNNTest : public ::testing::Test { typedef HaversineKNNTest HaversineKNNTestF; -TEST_F(HaversineKNNTestF, Fit) { - ASSERT_TRUE(raft::devArrMatch(d_ref_D.data(), d_pred_D.data(), n * n, - raft::CompareApprox(1e-3))); - ASSERT_TRUE(raft::devArrMatch(d_ref_I.data(), d_pred_I.data(), n * n, - raft::Compare())); +TEST_F(HaversineKNNTestF, Fit) +{ + ASSERT_TRUE( + raft::devArrMatch(d_ref_D.data(), d_pred_D.data(), n * n, raft::CompareApprox(1e-3))); + ASSERT_TRUE(raft::devArrMatch(d_ref_I.data(), d_pred_I.data(), n * n, raft::Compare())); } } // namespace knn diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 35a82b1e53..49e5aaab4b 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -36,17 +36,17 @@ struct KNNInputs { std::vector labels; }; -__global__ void build_actual_output(int *output, int n_rows, int k, - const int *idx_labels, - const int64_t *indices) { +__global__ void build_actual_output( + int* output, int n_rows, int k, const int* idx_labels, const int64_t* indices) +{ int element = threadIdx.x + blockDim.x * blockIdx.x; if (element >= n_rows * k) return; output[element] = idx_labels[indices[element]]; } -__global__ void build_expected_output(int *output, int n_rows, int k, - const int *labels) { +__global__ void build_expected_output(int* output, int n_rows, int k, const int* labels) +{ int row = threadIdx.x + blockDim.x * blockIdx.x; if (row >= n_rows) return; @@ -68,23 +68,33 @@ class KNNTest : public ::testing::TestWithParam { search_data_(0, stream), indices_(0, stream), distances_(0, stream), - search_labels_(0, stream) {} + search_labels_(0, stream) + { + } protected: - void testBruteForce() { - raft::print_device_vector("Input array: ", input_.data(), rows_ * cols_, - std::cout); + void testBruteForce() + { + raft::print_device_vector("Input array: ", input_.data(), rows_ * cols_, std::cout); std::cout << "K: " << k_ << "\n"; - raft::print_device_vector("Labels array: ", search_labels_.data(), rows_, - std::cout); + raft::print_device_vector("Labels array: ", search_labels_.data(), rows_, std::cout); - std::vector input_vec; + std::vector input_vec; std::vector sizes_vec; input_vec.push_back(input_.data()); sizes_vec.push_back(rows_); - brute_force_knn(handle, input_vec, sizes_vec, cols_, search_data_.data(), - rows_, indices_.data(), distances_.data(), k_, true, true); + brute_force_knn(handle, + input_vec, + sizes_vec, + cols_, + search_data_.data(), + rows_, + indices_.data(), + distances_.data(), + k_, + true, + true); build_actual_output<<>>( actual_labels_.data(), rows_, k_, search_labels_.data(), indices_.data()); @@ -92,14 +102,15 @@ class KNNTest : public ::testing::TestWithParam { build_expected_output<<>>( expected_labels_.data(), rows_, k_, search_labels_.data()); - ASSERT_TRUE(devArrMatch(expected_labels_.data(), actual_labels_.data(), - rows_ * k_, raft::Compare())); + ASSERT_TRUE(devArrMatch( + expected_labels_.data(), actual_labels_.data(), rows_ * k_, raft::Compare())); } - void SetUp() override { + void SetUp() override + { rows_ = params_.input.size(); cols_ = params_.input[0].size(); - k_ = params_.k; + k_ = params_.k; actual_labels_.resize(rows_ * k_, stream); expected_labels_.resize(rows_ * k_, stream); @@ -109,20 +120,17 @@ class KNNTest : public ::testing::TestWithParam { distances_.resize(rows_ * k_, stream); search_labels_.resize(rows_, stream); - CUDA_CHECK(cudaMemsetAsync(actual_labels_.data(), 0, - actual_labels_.size() * sizeof(int), stream)); - CUDA_CHECK(cudaMemsetAsync(expected_labels_.data(), 0, - expected_labels_.size() * sizeof(int), stream)); CUDA_CHECK( - cudaMemsetAsync(input_.data(), 0, input_.size() * sizeof(float), stream)); - CUDA_CHECK(cudaMemsetAsync(search_data_.data(), 0, - search_data_.size() * sizeof(float), stream)); - CUDA_CHECK(cudaMemsetAsync(indices_.data(), 0, - indices_.size() * sizeof(int64_t), stream)); - CUDA_CHECK(cudaMemsetAsync(distances_.data(), 0, - distances_.size() * sizeof(float), stream)); - CUDA_CHECK(cudaMemsetAsync(search_labels_.data(), 0, - search_labels_.size() * sizeof(int), stream)); + cudaMemsetAsync(actual_labels_.data(), 0, actual_labels_.size() * sizeof(int), stream)); + CUDA_CHECK( + cudaMemsetAsync(expected_labels_.data(), 0, expected_labels_.size() * sizeof(int), stream)); + CUDA_CHECK(cudaMemsetAsync(input_.data(), 0, input_.size() * sizeof(float), stream)); + CUDA_CHECK( + cudaMemsetAsync(search_data_.data(), 0, search_data_.size() * sizeof(float), stream)); + CUDA_CHECK(cudaMemsetAsync(indices_.data(), 0, indices_.size() * sizeof(int64_t), stream)); + CUDA_CHECK(cudaMemsetAsync(distances_.data(), 0, distances_.size() * sizeof(float), stream)); + CUDA_CHECK( + cudaMemsetAsync(search_labels_.data(), 0, search_labels_.size() * sizeof(int), stream)); std::vector row_major_input; for (std::size_t i = 0; i < params_.input.size(); ++i) { @@ -130,13 +138,13 @@ class KNNTest : public ::testing::TestWithParam { row_major_input.push_back(params_.input[i][j]); } } - rmm::device_buffer input_d = rmm::device_buffer( - row_major_input.data(), row_major_input.size() * sizeof(float), stream); - float *input_ptr = static_cast(input_d.data()); + rmm::device_buffer input_d = + rmm::device_buffer(row_major_input.data(), row_major_input.size() * sizeof(float), stream); + float* input_ptr = static_cast(input_d.data()); - rmm::device_buffer labels_d = rmm::device_buffer( - params_.labels.data(), params_.labels.size() * sizeof(int), stream); - int *labels_ptr = static_cast(labels_d.data()); + rmm::device_buffer labels_d = + rmm::device_buffer(params_.labels.data(), params_.labels.size() * sizeof(int), stream); + int* labels_ptr = static_cast(labels_d.data()); raft::copy(input_.data(), input_ptr, rows_ * cols_, stream); raft::copy(search_data_.data(), input_ptr, rows_ * cols_, stream); diff --git a/cpp/test/spatial/selection.cu b/cpp/test/spatial/selection.cu index 7742b9bd30..ad6d1e58d1 100644 --- a/cpp/test/spatial/selection.cu +++ b/cpp/test/spatial/selection.cu @@ -45,8 +45,9 @@ struct SparseSelectionInputs { }; template -::std::ostream &operator<<( - ::std::ostream &os, const SparseSelectionInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, + const SparseSelectionInputs& dims) +{ return os; } @@ -55,18 +56,20 @@ class SparseSelectionTest : public ::testing::TestWithParam> { public: SparseSelectionTest() - : params(::testing::TestWithParam< - SparseSelectionInputs>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), dists(0, stream), inds(0, stream), out_indices_ref(0, stream), out_dists_ref(0, stream), out_dists(0, stream), - out_indices(0, stream) {} + out_indices(0, stream) + { + } protected: - void make_data() { + void make_data() + { std::vector dists_h = params.dists_h; dists.resize(n_rows * n_cols, stream); @@ -77,36 +80,43 @@ class SparseSelectionTest update_device(dists.data(), dists_h.data(), dists_h.size(), stream); iota_fill(inds.data(), n_rows, n_cols, stream); - std::vector out_dists_ref_h = params.out_dists_ref_h; + std::vector out_dists_ref_h = params.out_dists_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; out_indices_ref.resize(out_indices_ref_h.size(), stream); out_dists_ref.resize(out_dists_ref_h.size(), stream); - update_device(out_indices_ref.data(), out_indices_ref_h.data(), - out_indices_ref_h.size(), stream); - update_device(out_dists_ref.data(), out_dists_ref_h.data(), - out_dists_ref_h.size(), stream); + update_device( + out_indices_ref.data(), out_indices_ref_h.data(), out_indices_ref_h.size(), stream); + update_device(out_dists_ref.data(), out_dists_ref_h.data(), out_dists_ref_h.size(), stream); } - void SetUp() override { + void SetUp() override + { n_rows = params.n_rows; n_cols = params.n_cols; - k = params.k; + k = params.k; make_data(); - raft::spatial::knn::select_k(dists.data(), inds.data(), n_rows, n_cols, - out_dists.data(), out_indices.data(), - params.select_min, k, stream); + raft::spatial::knn::select_k(dists.data(), + inds.data(), + n_rows, + n_cols, + out_dists.data(), + out_indices.data(), + params.select_min, + k, + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } - void compare() { - ASSERT_TRUE(devArrMatch(out_dists_ref.data(), out_dists.data(), n_rows * k, - Compare())); - ASSERT_TRUE(devArrMatch(out_indices_ref.data(), out_indices.data(), - n_rows * k, Compare())); + void compare() + { + ASSERT_TRUE( + devArrMatch(out_dists_ref.data(), out_dists.data(), n_rows * k, Compare())); + ASSERT_TRUE( + devArrMatch(out_indices_ref.data(), out_indices.data(), n_rows * k, Compare())); } protected: @@ -141,7 +151,8 @@ const std::vector> inputs_i32_f = { true}}; typedef SparseSelectionTest SparseSelectionTestF; TEST_P(SparseSelectionTestF, Result) { compare(); } -INSTANTIATE_TEST_CASE_P(SparseSelectionTest, SparseSelectionTestF, +INSTANTIATE_TEST_CASE_P(SparseSelectionTest, + SparseSelectionTestF, ::testing::ValuesIn(inputs_i32_f)); }; // end namespace selection diff --git a/cpp/test/spatial/spatial_data.h b/cpp/test/spatial/spatial_data.h index 87891164fc..dbb32c4546 100644 --- a/cpp/test/spatial/spatial_data.h +++ b/cpp/test/spatial/spatial_data.h @@ -5,23 +5,18 @@ namespace spatial { // Latitude and longitude coordinates of 51 US states / territories std::vector spatial_data = { - 63.588753, -154.493062, 32.318231, -86.902298, 35.20105, -91.831833, - 34.048928, -111.093731, 36.778261, -119.417932, 39.550051, -105.782067, - 41.603221, -73.087749, 38.905985, -77.033418, 38.910832, -75.52767, - 27.664827, -81.515754, 32.157435, -82.907123, 19.898682, -155.665857, - 41.878003, -93.097702, 44.068202, -114.742041, 40.633125, -89.398528, - 40.551217, -85.602364, 39.011902, -98.484246, 37.839333, -84.270018, - 31.244823, -92.145024, 42.407211, -71.382437, 39.045755, -76.641271, - 45.253783, -69.445469, 44.314844, -85.602364, 46.729553, -94.6859, - 37.964253, -91.831833, 32.354668, -89.398528, 46.879682, -110.362566, - 35.759573, -79.0193, 47.551493, -101.002012, 41.492537, -99.901813, - 43.193852, -71.572395, 40.058324, -74.405661, 34.97273, -105.032363, - 38.80261, -116.419389, 43.299428, -74.217933, 40.417287, -82.907123, - 35.007752, -97.092877, 43.804133, -120.554201, 41.203322, -77.194525, - 18.220833, -66.590149, 41.580095, -71.477429, 33.836081, -81.163725, - 43.969515, -99.901813, 35.517491, -86.580447, 31.968599, -99.901813, - 39.32098, -111.093731, 37.431573, -78.656894, 44.558803, -72.577841, - 47.751074, -120.740139, 43.78444, -88.787868, 38.597626, -80.454903, - 43.075968, -107.290284}; + 63.588753, -154.493062, 32.318231, -86.902298, 35.20105, -91.831833, 34.048928, -111.093731, + 36.778261, -119.417932, 39.550051, -105.782067, 41.603221, -73.087749, 38.905985, -77.033418, + 38.910832, -75.52767, 27.664827, -81.515754, 32.157435, -82.907123, 19.898682, -155.665857, + 41.878003, -93.097702, 44.068202, -114.742041, 40.633125, -89.398528, 40.551217, -85.602364, + 39.011902, -98.484246, 37.839333, -84.270018, 31.244823, -92.145024, 42.407211, -71.382437, + 39.045755, -76.641271, 45.253783, -69.445469, 44.314844, -85.602364, 46.729553, -94.6859, + 37.964253, -91.831833, 32.354668, -89.398528, 46.879682, -110.362566, 35.759573, -79.0193, + 47.551493, -101.002012, 41.492537, -99.901813, 43.193852, -71.572395, 40.058324, -74.405661, + 34.97273, -105.032363, 38.80261, -116.419389, 43.299428, -74.217933, 40.417287, -82.907123, + 35.007752, -97.092877, 43.804133, -120.554201, 41.203322, -77.194525, 18.220833, -66.590149, + 41.580095, -71.477429, 33.836081, -81.163725, 43.969515, -99.901813, 35.517491, -86.580447, + 31.968599, -99.901813, 39.32098, -111.093731, 37.431573, -78.656894, 44.558803, -72.577841, + 47.751074, -120.740139, 43.78444, -88.787868, 38.597626, -80.454903, 43.075968, -107.290284}; }; // namespace spatial }; // namespace raft \ No newline at end of file diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu index 388ad56f2d..fa54b04cda 100644 --- a/cpp/test/spectral_matrix.cu +++ b/cpp/test/spectral_matrix.cu @@ -32,7 +32,8 @@ struct csr_view_t { index_type number_of_edges; }; } // namespace -TEST(Raft, SpectralMatrices) { +TEST(Raft, SpectralMatrices) +{ using namespace matrix; using index_type = int; using value_type = double; @@ -48,7 +49,7 @@ TEST(Raft, SpectralMatrices) { index_type* ro{nullptr}; index_type* ci{nullptr}; value_type* vs{nullptr}; - index_type nnz = 0; + index_type nnz = 0; index_type nrows = 0; sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; sparse_matrix_t sm2{h, csr_v}; @@ -62,9 +63,7 @@ TEST(Raft, SpectralMatrices) { }; EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr args - auto cnstr_lm2 = [&h, &sm2](void) { - laplacian_matrix_t lm2{h, sm2}; - }; + auto cnstr_lm2 = [&h, &sm2](void) { laplacian_matrix_t lm2{h, sm2}; }; EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args auto cnstr_mm1 = [&h, ro, ci, vs, nrows, nnz](void) { @@ -72,9 +71,7 @@ TEST(Raft, SpectralMatrices) { }; EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args - auto cnstr_mm2 = [&h, &sm2](void) { - modularity_matrix_t mm2{h, sm2}; - }; + auto cnstr_mm2 = [&h, &sm2](void) { modularity_matrix_t mm2{h, sm2}; }; EXPECT_ANY_THROW(cnstr_mm2()); // because of nullptr ptr args } diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu index cf866a5663..b8ea2cb799 100644 --- a/cpp/test/stats/mean.cu +++ b/cpp/test/stats/mean.cu @@ -35,7 +35,8 @@ struct MeanInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const MeanInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MeanInputs& dims) +{ return os; } @@ -48,20 +49,23 @@ class MeanTest : public ::testing::TestWithParam> { rows(params.rows), cols(params.cols), data(rows * cols, stream), - mean_act(rows * cols, stream) {} + mean_act(rows * cols, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); int len = rows * cols; r.normal(data.data(), len, params.mean, (T)1.0, stream); meanSGtest(data.data(), stream); } - void meanSGtest(T *data, cudaStream_t stream) { + void meanSGtest(T* data, cudaStream_t stream) + { int rows = params.rows, cols = params.cols; - mean(mean_act.data(), data, cols, rows, params.sample, params.rowMajor, - stream); + mean(mean_act.data(), data, cols, rows, params.sample, params.rowMajor, stream); } protected: @@ -76,52 +80,52 @@ class MeanTest : public ::testing::TestWithParam> { // Note: For 1024 samples, 256 experiments, a mean of 1.0 with stddev=1.0, the // measured mean (of a normal distribution) will fall outside of an epsilon of // 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times) -const std::vector> inputsf = { - {0.15f, 1.f, 1024, 32, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, - {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, - {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, - {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 256, false, true, 1234ULL}}; - -const std::vector> inputsd = { - {0.15, 1.0, 1024, 32, true, false, 1234ULL}, - {0.15, 1.0, 1024, 64, true, false, 1234ULL}, - {0.15, 1.0, 1024, 128, true, false, 1234ULL}, - {0.15, 1.0, 1024, 256, true, false, 1234ULL}, - {0.15, -1.0, 1024, 32, false, false, 1234ULL}, - {0.15, -1.0, 1024, 64, false, false, 1234ULL}, - {0.15, -1.0, 1024, 128, false, false, 1234ULL}, - {0.15, -1.0, 1024, 256, false, false, 1234ULL}, - {0.15, 1.0, 1024, 32, true, true, 1234ULL}, - {0.15, 1.0, 1024, 64, true, true, 1234ULL}, - {0.15, 1.0, 1024, 128, true, true, 1234ULL}, - {0.15, 1.0, 1024, 256, true, true, 1234ULL}, - {0.15, -1.0, 1024, 32, false, true, 1234ULL}, - {0.15, -1.0, 1024, 64, false, true, 1234ULL}, - {0.15, -1.0, 1024, 128, false, true, 1234ULL}, - {0.15, -1.0, 1024, 256, false, true, 1234ULL}}; +const std::vector> inputsf = {{0.15f, 1.f, 1024, 32, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, + {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, true, 1234ULL}}; + +const std::vector> inputsd = {{0.15, 1.0, 1024, 32, true, false, 1234ULL}, + {0.15, 1.0, 1024, 64, true, false, 1234ULL}, + {0.15, 1.0, 1024, 128, true, false, 1234ULL}, + {0.15, 1.0, 1024, 256, true, false, 1234ULL}, + {0.15, -1.0, 1024, 32, false, false, 1234ULL}, + {0.15, -1.0, 1024, 64, false, false, 1234ULL}, + {0.15, -1.0, 1024, 128, false, false, 1234ULL}, + {0.15, -1.0, 1024, 256, false, false, 1234ULL}, + {0.15, 1.0, 1024, 32, true, true, 1234ULL}, + {0.15, 1.0, 1024, 64, true, true, 1234ULL}, + {0.15, 1.0, 1024, 128, true, true, 1234ULL}, + {0.15, 1.0, 1024, 256, true, true, 1234ULL}, + {0.15, -1.0, 1024, 32, false, true, 1234ULL}, + {0.15, -1.0, 1024, 64, false, true, 1234ULL}, + {0.15, -1.0, 1024, 128, false, true, 1234ULL}, + {0.15, -1.0, 1024, 256, false, true, 1234ULL}}; typedef MeanTest MeanTestF; -TEST_P(MeanTestF, Result) { - ASSERT_TRUE(devArrMatch(params.mean, mean_act.data(), params.cols, - CompareApprox(params.tolerance))); +TEST_P(MeanTestF, Result) +{ + ASSERT_TRUE( + devArrMatch(params.mean, mean_act.data(), params.cols, CompareApprox(params.tolerance))); } typedef MeanTest MeanTestD; -TEST_P(MeanTestD, Result) { - ASSERT_TRUE(devArrMatch(params.mean, mean_act.data(), params.cols, - CompareApprox(params.tolerance))); +TEST_P(MeanTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + params.mean, mean_act.data(), params.cols, CompareApprox(params.tolerance))); } INSTANTIATE_TEST_SUITE_P(MeanTests, MeanTestF, ::testing::ValuesIn(inputsf)); diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu index dcc4b4e551..6a76a289d7 100644 --- a/cpp/test/stats/mean_center.cu +++ b/cpp/test/stats/mean_center.cu @@ -34,37 +34,49 @@ struct MeanCenterInputs { }; template -::std::ostream &operator<<(::std::ostream &os, - const MeanCenterInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const MeanCenterInputs& dims) +{ return os; } template -class MeanCenterTest - : public ::testing::TestWithParam> { +class MeanCenterTest : public ::testing::TestWithParam> { public: MeanCenterTest() - : params( - ::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), rows(params.rows), cols(params.cols), out(rows * cols, stream), out_ref(rows * cols, stream), data(rows * cols, stream), - meanVec(params.bcastAlongRows ? cols : rows, stream) {} + meanVec(params.bcastAlongRows ? cols : rows, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { raft::random::Rng r(params.seed); auto len = rows * cols; r.normal(data.data(), len, params.mean, (T)1.0, stream); - raft::stats::mean(meanVec.data(), data.data(), cols, rows, params.sample, - params.rowMajor, stream); - meanCenter(out.data(), data.data(), meanVec.data(), cols, rows, - params.rowMajor, params.bcastAlongRows, stream); - raft::linalg::naiveMatVec(out_ref.data(), data.data(), meanVec.data(), cols, - rows, params.rowMajor, params.bcastAlongRows, + raft::stats::mean( + meanVec.data(), data.data(), cols, rows, params.sample, params.rowMajor, stream); + meanCenter(out.data(), + data.data(), + meanVec.data(), + cols, + rows, + params.rowMajor, + params.bcastAlongRows, + stream); + raft::linalg::naiveMatVec(out_ref.data(), + data.data(), + meanVec.data(), + cols, + rows, + params.rowMajor, + params.bcastAlongRows, (T)-1.0); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -104,12 +116,12 @@ const std::vector> inputsf_i32 = { {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL}, {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestF_i32; -TEST_P(MeanCenterTestF_i32, Result) { - ASSERT_TRUE(devArrMatch(out.data(), out_ref.data(), params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestF_i32, Result) +{ + ASSERT_TRUE(devArrMatch( + out.data(), out_ref.data(), params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, - ::testing::ValuesIn(inputsf_i32)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, ::testing::ValuesIn(inputsf_i32)); const std::vector> inputsf_i64 = { {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL}, @@ -137,12 +149,12 @@ const std::vector> inputsf_i64 = { {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL}, {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestF_i64; -TEST_P(MeanCenterTestF_i64, Result) { - ASSERT_TRUE(devArrMatch(out.data(), out_ref.data(), params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestF_i64, Result) +{ + ASSERT_TRUE(devArrMatch( + out.data(), out_ref.data(), params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, - ::testing::ValuesIn(inputsf_i64)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, ::testing::ValuesIn(inputsf_i64)); const std::vector> inputsd_i32 = { {0.05, 1.0, 1024, 32, true, false, true, 1234ULL}, @@ -170,12 +182,12 @@ const std::vector> inputsd_i32 = { {0.05, -1.0, 1024, 64, false, true, false, 1234ULL}, {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestD_i32; -TEST_P(MeanCenterTestD_i32, Result) { - ASSERT_TRUE(devArrMatch(out.data(), out_ref.data(), params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestD_i32, Result) +{ + ASSERT_TRUE(devArrMatch( + out.data(), out_ref.data(), params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, - ::testing::ValuesIn(inputsd_i32)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, ::testing::ValuesIn(inputsd_i32)); const std::vector> inputsd_i64 = { {0.05, 1.0, 1024, 32, true, false, true, 1234ULL}, @@ -203,12 +215,12 @@ const std::vector> inputsd_i64 = { {0.05, -1.0, 1024, 64, false, true, false, 1234ULL}, {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}}; typedef MeanCenterTest MeanCenterTestD_i64; -TEST_P(MeanCenterTestD_i64, Result) { - ASSERT_TRUE(devArrMatch(out.data(), out_ref.data(), params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(MeanCenterTestD_i64, Result) +{ + ASSERT_TRUE(devArrMatch( + out.data(), out_ref.data(), params.cols, raft::CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64, - ::testing::ValuesIn(inputsd_i64)); +INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64, ::testing::ValuesIn(inputsd_i64)); } // end namespace stats } // end namespace raft diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index 53f392aaf3..3efc54264e 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -34,7 +34,8 @@ struct StdDevInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const StdDevInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const StdDevInputs& dims) +{ return os; } @@ -49,10 +50,13 @@ class StdDevTest : public ::testing::TestWithParam> { data(rows * cols, stream), mean_act(cols, stream), stddev_act(cols, stream), - vars_act(cols, stream) {} + vars_act(cols, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { random::Rng r(params.seed); int len = rows * cols; @@ -65,17 +69,17 @@ class StdDevTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(stream)); } - void stdVarSGtest(T *data, cudaStream_t stream) { + void stdVarSGtest(T* data, cudaStream_t stream) + { int rows = params.rows, cols = params.cols; - mean(mean_act.data(), data, cols, rows, params.sample, params.rowMajor, - stream); + mean(mean_act.data(), data, cols, rows, params.sample, params.rowMajor, stream); - stddev(stddev_act.data(), data, mean_act.data(), cols, rows, params.sample, - params.rowMajor, stream); + stddev( + stddev_act.data(), data, mean_act.data(), cols, rows, params.sample, params.rowMajor, stream); - vars(vars_act.data(), data, mean_act.data(), cols, rows, params.sample, - params.rowMajor, stream); + vars( + vars_act.data(), data, mean_act.data(), cols, rows, params.sample, params.rowMajor, stream); raft::matrix::seqRoot(vars_act.data(), T(1), cols, stream); } @@ -126,28 +130,28 @@ const std::vector> inputsd = { {0.1, -1.0, 2.0, 1024, 256, false, true, 1234ULL}}; typedef StdDevTest StdDevTestF; -TEST_P(StdDevTestF, Result) { - ASSERT_TRUE(devArrMatch(params.stddev, stddev_act.data(), params.cols, - CompareApprox(params.tolerance))); +TEST_P(StdDevTestF, Result) +{ + ASSERT_TRUE(devArrMatch( + params.stddev, stddev_act.data(), params.cols, CompareApprox(params.tolerance))); - ASSERT_TRUE(devArrMatch(stddev_act.data(), vars_act.data(), params.cols, - CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch( + stddev_act.data(), vars_act.data(), params.cols, CompareApprox(params.tolerance))); } typedef StdDevTest StdDevTestD; -TEST_P(StdDevTestD, Result) { - ASSERT_TRUE(devArrMatch(params.stddev, stddev_act.data(), params.cols, - CompareApprox(params.tolerance))); +TEST_P(StdDevTestD, Result) +{ + ASSERT_TRUE(devArrMatch( + params.stddev, stddev_act.data(), params.cols, CompareApprox(params.tolerance))); - ASSERT_TRUE(devArrMatch(stddev_act.data(), vars_act.data(), params.cols, - CompareApprox(params.tolerance))); + ASSERT_TRUE(devArrMatch( + stddev_act.data(), vars_act.data(), params.cols, CompareApprox(params.tolerance))); } -INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestF, - ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestD, - ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestD, ::testing::ValuesIn(inputsd)); } // end namespace stats } // end namespace raft diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index ac4d642c8e..ecb1171ea5 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -32,7 +32,8 @@ struct SumInputs { }; template -::std::ostream &operator<<(::std::ostream &os, const SumInputs &dims) { +::std::ostream& operator<<(::std::ostream& os, const SumInputs& dims) +{ return os; } @@ -45,10 +46,13 @@ class SumTest : public ::testing::TestWithParam> { rows(params.rows), cols(params.cols), data(rows * cols, stream), - sum_act(cols, stream) {} + sum_act(cols, stream) + { + } protected: - void SetUp() override { + void SetUp() override + { int len = rows * cols; T data_h[len]; @@ -77,14 +81,17 @@ const std::vector> inputsd = {{0.05, 1024, 32, 1234ULL}, {0.05, 1024, 256, 1234ULL}}; typedef SumTest SumTestF; -TEST_P(SumTestF, Result) { - ASSERT_TRUE(raft::devArrMatch(float(params.rows), sum_act.data(), params.cols, - raft::CompareApprox(params.tolerance))); +TEST_P(SumTestF, Result) +{ + ASSERT_TRUE(raft::devArrMatch( + float(params.rows), sum_act.data(), params.cols, raft::CompareApprox(params.tolerance))); } typedef SumTest SumTestD; -TEST_P(SumTestD, Result) { - ASSERT_TRUE(raft::devArrMatch(double(params.rows), sum_act.data(), +TEST_P(SumTestD, Result) +{ + ASSERT_TRUE(raft::devArrMatch(double(params.rows), + sum_act.data(), params.cols, raft::CompareApprox(params.tolerance))); } diff --git a/cpp/test/test_utils.h b/cpp/test/test_utils.h index 0f135c0121..58b9ae42ae 100644 --- a/cpp/test/test_utils.h +++ b/cpp/test/test_utils.h @@ -32,15 +32,16 @@ namespace raft { template struct Compare { - bool operator()(const T &a, const T &b) const { return a == b; } + bool operator()(const T& a, const T& b) const { return a == b; } }; template struct CompareApprox { CompareApprox(T eps_) : eps(eps_) {} - bool operator()(const T &a, const T &b) const { - T diff = abs(a - b); - T m = std::max(abs(a), abs(b)); + bool operator()(const T& a, const T& b) const + { + T diff = abs(a - b); + T m = std::max(abs(a), abs(b)); T ratio = diff >= eps ? diff / m : diff; return (ratio <= eps); @@ -53,9 +54,10 @@ struct CompareApprox { template struct CompareApproxAbs { CompareApproxAbs(T eps_) : eps(eps_) {} - bool operator()(const T &a, const T &b) const { - T diff = abs(abs(a) - abs(b)); - T m = std::max(abs(a), abs(b)); + bool operator()(const T& a, const T& b) const + { + T diff = abs(abs(a) - abs(b)); + T m = std::max(abs(a), abs(b)); T ratio = diff >= eps ? diff / m : diff; return (ratio <= eps); } @@ -65,25 +67,26 @@ struct CompareApproxAbs { }; template -T abs(const T &a) { +T abs(const T& a) +{ return a > T(0) ? a : -a; } /* - * @brief Helper function to compare 2 device n-D arrays with custom comparison - * @tparam T the data type of the arrays - * @tparam L the comparator lambda or object function - * @param expected expected value(s) - * @param actual actual values - * @param eq_compare the comparator - * @param stream cuda stream - * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE - * @{ - */ + * @brief Helper function to compare 2 device n-D arrays with custom comparison + * @tparam T the data type of the arrays + * @tparam L the comparator lambda or object function + * @param expected expected value(s) + * @param actual actual values + * @param eq_compare the comparator + * @param stream cuda stream + * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE + * @{ + */ template -testing::AssertionResult devArrMatch(const T *expected, const T *actual, - size_t size, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult devArrMatch( + const T* expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0) +{ std::unique_ptr exp_h(new T[size]); std::unique_ptr act_h(new T[size]); raft::update_host(exp_h.get(), expected, size, stream); @@ -93,16 +96,16 @@ testing::AssertionResult devArrMatch(const T *expected, const T *actual, auto exp = exp_h.get()[i]; auto act = act_h.get()[i]; if (!eq_compare(exp, act)) { - return testing::AssertionFailure() - << "actual=" << act << " != expected=" << exp << " @" << i; + return testing::AssertionFailure() << "actual=" << act << " != expected=" << exp << " @" << i; } } return testing::AssertionSuccess(); } template -testing::AssertionResult devArrMatch(T expected, const T *actual, size_t size, - L eq_compare, cudaStream_t stream = 0) { +testing::AssertionResult devArrMatch( + T expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0) +{ std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual, size, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -117,9 +120,13 @@ testing::AssertionResult devArrMatch(T expected, const T *actual, size_t size, } template -testing::AssertionResult devArrMatch(const T *expected, const T *actual, - size_t rows, size_t cols, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult devArrMatch(const T* expected, + const T* actual, + size_t rows, + size_t cols, + L eq_compare, + cudaStream_t stream = 0) +{ size_t size = rows * cols; std::unique_ptr exp_h(new T[size]); std::unique_ptr act_h(new T[size]); @@ -133,8 +140,7 @@ testing::AssertionResult devArrMatch(const T *expected, const T *actual, auto act = act_h.get()[idx]; if (!eq_compare(exp, act)) { return testing::AssertionFailure() - << "actual=" << act << " != expected=" << exp << " @" << i << "," - << j; + << "actual=" << act << " != expected=" << exp << " @" << i << "," << j; } } } @@ -142,9 +148,9 @@ testing::AssertionResult devArrMatch(const T *expected, const T *actual, } template -testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows, - size_t cols, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult devArrMatch( + T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0) +{ size_t size = rows * cols; std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual, size, stream); @@ -155,8 +161,7 @@ testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows, auto act = act_h.get()[idx]; if (!eq_compare(expected, act)) { return testing::AssertionFailure() - << "actual=" << act << " != expected=" << expected << " @" << i - << "," << j; + << "actual=" << act << " != expected=" << expected << " @" << i << "," << j; } } } @@ -164,24 +169,24 @@ testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows, } /* - * @brief Helper function to compare a device n-D arrays with an expected array - * on the host, using a custom comparison - * @tparam T the data type of the arrays - * @tparam L the comparator lambda or object function - * @param expected_h host array of expected value(s) - * @param actual_d device array actual values - * @param eq_compare the comparator - * @param stream cuda stream - * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE - */ + * @brief Helper function to compare a device n-D arrays with an expected array + * on the host, using a custom comparison + * @tparam T the data type of the arrays + * @tparam L the comparator lambda or object function + * @param expected_h host array of expected value(s) + * @param actual_d device array actual values + * @param eq_compare the comparator + * @param stream cuda stream + * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE + */ template -testing::AssertionResult devArrMatchHost(const T *expected_h, const T *actual_d, - size_t size, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult devArrMatchHost( + const T* expected_h, const T* actual_d, size_t size, L eq_compare, cudaStream_t stream = 0) +{ std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual_d, size, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); - bool ok = true; + bool ok = true; auto fail = testing::AssertionFailure(); for (size_t i(0); i < size; ++i) { auto exp = expected_h[i]; @@ -196,19 +201,19 @@ testing::AssertionResult devArrMatchHost(const T *expected_h, const T *actual_d, } /* - * @brief Helper function to compare diagonal values of a 2D matrix - * @tparam T the data type of the arrays - * @tparam L the comparator lambda or object function - * @param expected expected value along diagonal - * @param actual actual matrix - * @param eq_compare the comparator - * @param stream cuda stream - * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE - */ + * @brief Helper function to compare diagonal values of a 2D matrix + * @tparam T the data type of the arrays + * @tparam L the comparator lambda or object function + * @param expected expected value along diagonal + * @param actual actual matrix + * @param eq_compare the comparator + * @param stream cuda stream + * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE + */ template -testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows, - size_t cols, L eq_compare, - cudaStream_t stream = 0) { +testing::AssertionResult diagonalMatch( + T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0) +{ size_t size = rows * cols; std::unique_ptr act_h(new T[size]); raft::update_host(act_h.get(), actual, size, stream); @@ -220,8 +225,7 @@ testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows, auto act = act_h.get()[idx]; if (!eq_compare(expected, act)) { return testing::AssertionFailure() - << "actual=" << act << " != expected=" << expected << " @" << i - << "," << j; + << "actual=" << act << " != expected=" << expected << " @" << i << "," << j; } } } @@ -229,10 +233,10 @@ testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows, } template -testing::AssertionResult match(const T expected, T actual, L eq_compare) { +testing::AssertionResult match(const T expected, T actual, L eq_compare) +{ if (!eq_compare(expected, actual)) { - return testing::AssertionFailure() - << "actual=" << actual << " != expected=" << expected; + return testing::AssertionFailure() << "actual=" << actual << " != expected=" << expected; } return testing::AssertionSuccess(); } @@ -256,8 +260,8 @@ testing::AssertionResult match(const T expected, T actual, L eq_compare) { ms /= args.runs; \ } while (0) -inline std::vector read_csv(std::string filename, - bool skip_first_n_columns = 1) { +inline std::vector read_csv(std::string filename, bool skip_first_n_columns = 1) +{ std::vector result; std::ifstream myFile(filename); if (!myFile.is_open()) throw std::runtime_error("Could not open file"); @@ -268,8 +272,7 @@ inline std::vector read_csv(std::string filename, if (myFile.good()) { std::getline(myFile, line); std::stringstream ss(line); - while (std::getline(ss, colname, ',')) { - } + while (std::getline(ss, colname, ',')) {} } int n_lines = 0; From d7b4f0adf2a4f13ddcc7b7072ad0123137182f96 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra Date: Wed, 24 Nov 2021 18:09:39 -0500 Subject: [PATCH 5/5] Missed change --- cpp/include/raft/spectral/partition.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 88cc8aa8f0..b52bfcc0d6 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -79,7 +79,7 @@ std::tuple partition(handle_t const& handle, std::tuple stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, - //cluster solver residual, # iters cluster solver + // cluster solver residual, # iters cluster solver vertex_t n = csr_m.nrows_;